Part 6: OCR and Image Processing
Why OCR and Image Processing?
Visual document understanding enables:
- Document digitization: Convert paper documents to searchable text
- Screenshot analysis: Extract text from screen captures
- Handwriting recognition: Best-effort processing of handwritten notes (classic Tesseract is weak on handwriting, so expect limited accuracy)
- Multi-modal interaction: Understand visual context in conversations
OCR Implementation with Tesseract
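Before running the processor below, make sure both the native Tesseract binary and the Python bindings are installed. A typical setup is pip install pytesseract pillow opencv-python numpy plus a system package such as tesseract-ocr on Debian/Ubuntu (on Windows, a standalone Tesseract installer). The following sanity check is a minimal sketch, assuming a reasonably recent pytesseract:
# check_tesseract.py - confirm the Tesseract binary is reachable from Python
import pytesseract
print(pytesseract.get_tesseract_version())   # e.g. 5.x
print(pytesseract.get_languages(config=''))  # installed language packs, e.g. ['eng', 'osd']
If the binary is not on your PATH, point pytesseract at it explicitly via pytesseract.pytesseract.tesseract_cmd, as shown in the commented lines of the class below.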
# vision/ocr_processor.py
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
import cv2
import numpy as np
import asyncio  # used by the demo entry point at the bottom of this module
import logging
from typing import List, Dict, Any, Optional, Tuple
import tempfile
import os
from pathlib import Path
class OCRProcessor:
def __init__(self):
# Configure Tesseract (adjust path as needed)
# pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Linux
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' # Windows
# Supported image formats
self.supported_formats = {'.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'}
# OCR configuration for different document types
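        # Flag reference: --oem 3 lets Tesseract choose the best available engine
        # (the LSTM engine when present); --psm sets the page segmentation mode:
        # 6 = a single uniform block of text, 7 = a single text line, 8 = a single word.
        # Note that classic Tesseract handles handwriting poorly, so the
        # 'handwritten' profile below is best-effort only.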
self.ocr_configs = {
'default': '--oem 3 --psm 6',
'single_block': '--oem 3 --psm 6',
'single_line': '--oem 3 --psm 7',
'single_word': '--oem 3 --psm 8',
'digits_only': '--oem 3 --psm 6 -c tessedit_char_whitelist=0123456789',
'handwritten': '--oem 3 --psm 6',
}
def preprocess_image(self, image: Image.Image, enhancement_type: str = 'default') -> Image.Image:
"""Preprocess image for better OCR accuracy"""
# Convert to RGB if necessary
if image.mode != 'RGB':
image = image.convert('RGB')
# Apply different preprocessing based on type
if enhancement_type == 'scan':
# For scanned documents
image = self._enhance_scanned_document(image)
elif enhancement_type == 'photo':
# For photos of documents
image = self._enhance_photo_document(image)
elif enhancement_type == 'screenshot':
# For screenshots
image = self._enhance_screenshot(image)
else:
# Default enhancement
image = self._default_enhancement(image)
return image
def _enhance_scanned_document(self, image: Image.Image) -> Image.Image:
"""Enhance scanned documents"""
# Convert to OpenCV format
cv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
# Convert to grayscale
gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
# Apply Gaussian blur to reduce noise
blurred = cv2.GaussianBlur(gray, (5, 5), 0)
# Apply threshold to get binary image
_, thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        # Close small gaps in characters with a morphological close
        # (a 1x1 kernel would be a no-op, so use a small 2x2 kernel)
        kernel = np.ones((2, 2), np.uint8)
        cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
# Convert back to PIL
return Image.fromarray(cleaned)
def _enhance_photo_document(self, image: Image.Image) -> Image.Image:
"""Enhance photos of documents"""
# Convert to OpenCV format
cv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
# Convert to grayscale
gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
# Apply adaptive threshold
adaptive_thresh = cv2.adaptiveThreshold(
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
)
# Apply dilation and erosion to connect text components
kernel = np.ones((2, 2), np.uint8)
processed = cv2.morphologyEx(adaptive_thresh, cv2.MORPH_CLOSE, kernel)
return Image.fromarray(processed)
def _enhance_screenshot(self, image: Image.Image) -> Image.Image:
"""Enhance screenshots"""
# Screenshots are usually already high quality, minimal processing needed
enhancer = ImageEnhance.Contrast(image)
enhanced = enhancer.enhance(1.2)
enhancer = ImageEnhance.Sharpness(enhanced)
enhanced = enhancer.enhance(1.1)
return enhanced
def _default_enhancement(self, image: Image.Image) -> Image.Image:
"""Default image enhancement"""
# Enhance contrast
enhancer = ImageEnhance.Contrast(image)
enhanced = enhancer.enhance(1.2)
# Enhance sharpness
enhancer = ImageEnhance.Sharpness(enhanced)
enhanced = enhancer.enhance(1.1)
# Apply slight blur to reduce noise
enhanced = enhanced.filter(ImageFilter.MedianFilter(size=3))
return enhanced
def extract_text(
self,
image_path: str,
config_type: str = 'default',
language: str = 'eng',
enhancement_type: str = 'default'
) -> Dict[str, Any]:
"""Extract text from image"""
try:
# Load image
image = Image.open(image_path)
# Preprocess image
processed_image = self.preprocess_image(image, enhancement_type)
# Get OCR configuration
config = self.ocr_configs.get(config_type, self.ocr_configs['default'])
config += f' -l {language}'
# Extract text
text = pytesseract.image_to_string(processed_image, config=config)
# Get detailed information
data = pytesseract.image_to_data(processed_image, config=config, output_type=pytesseract.Output.DICT)
            # Calculate confidence scores; Tesseract may report them as floats
            # (e.g. "96.54"), so parse with float() rather than int()
            confidences = [float(conf) for conf in data['conf'] if float(conf) > 0]
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0
# Extract word-level information
words = []
for i in range(len(data['text'])):
                if float(data['conf'][i]) > 30:  # Only include words with decent confidence
                    words.append({
                        'text': data['text'][i],
                        'confidence': float(data['conf'][i]),
'bbox': {
'x': data['left'][i],
'y': data['top'][i],
'width': data['width'][i],
'height': data['height'][i]
}
})
return {
'text': text.strip(),
'confidence': avg_confidence,
'word_count': len([w for w in words if w['text'].strip()]),
'words': words,
'image_size': image.size,
'processing_config': config_type
}
except Exception as e:
logging.error(f"OCR extraction failed: {e}")
return {
'text': '',
'confidence': 0,
'error': str(e)
}
def extract_text_from_regions(
self,
image_path: str,
regions: List[Tuple[int, int, int, int]],
config_type: str = 'default'
) -> List[Dict[str, Any]]:
"""Extract text from specific regions of an image"""
results = []
try:
image = Image.open(image_path)
for i, (x, y, width, height) in enumerate(regions):
# Crop region
region = image.crop((x, y, x + width, y + height))
                # Save the crop to a temporary file so extract_text() can reopen it;
                # mkstemp with an immediately closed handle also works on Windows,
                # where a file held open by NamedTemporaryFile cannot be reopened
                fd, temp_path = tempfile.mkstemp(suffix='.png')
                os.close(fd)
                try:
                    region.save(temp_path)
                    # Extract text from region
                    result = self.extract_text(temp_path, config_type)
                    result['region_id'] = i
                    result['bbox'] = {'x': x, 'y': y, 'width': width, 'height': height}
                    results.append(result)
                finally:
                    # Clean up the temporary file
                    os.unlink(temp_path)
except Exception as e:
logging.error(f"Region OCR failed: {e}")
results.append({'error': str(e), 'region_id': -1})
return results
def detect_document_structure(self, image_path: str) -> Dict[str, Any]:
"""Detect document structure (headers, paragraphs, etc.)"""
try:
# Get detailed OCR data
image = Image.open(image_path)
processed_image = self.preprocess_image(image)
data = pytesseract.image_to_data(
processed_image,
config='--oem 3 --psm 6',
output_type=pytesseract.Output.DICT
)
# Group text by blocks and paragraphs
blocks = {}
paragraphs = {}
for i in range(len(data['text'])):
                if float(data['conf'][i]) > 30:
block_num = data['block_num'][i]
par_num = data['par_num'][i]
# Group by blocks
if block_num not in blocks:
blocks[block_num] = []
blocks[block_num].append({
'text': data['text'][i],
'bbox': {
'x': data['left'][i],
'y': data['top'][i],
'width': data['width'][i],
'height': data['height'][i]
}
})
# Group by paragraphs
par_key = f"{block_num}_{par_num}"
if par_key not in paragraphs:
paragraphs[par_key] = []
paragraphs[par_key].append({
'text': data['text'][i],
'bbox': {
'x': data['left'][i],
'y': data['top'][i],
'width': data['width'][i],
'height': data['height'][i]
}
})
# Reconstruct structured text
structured_text = []
for block_id in sorted(blocks.keys()):
block_text = ' '.join([item['text'] for item in blocks[block_id] if item['text'].strip()])
if block_text.strip():
structured_text.append({
'type': 'block',
'id': block_id,
'text': block_text.strip()
})
return {
'blocks': blocks,
'paragraphs': paragraphs,
'structured_text': structured_text,
'total_blocks': len(blocks),
'total_paragraphs': len(paragraphs)
}
except Exception as e:
logging.error(f"Document structure detection failed: {e}")
return {'error': str(e)}
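# Illustrative usage of region-based extraction (the file name and coordinates
# are hypothetical, e.g. a header strip and a totals box on a scanned form):
#
#   ocr = OCRProcessor()
#   regions = [(40, 30, 600, 90), (420, 900, 220, 70)]  # (x, y, width, height)
#   for r in ocr.extract_text_from_regions("form.png", regions, config_type='single_block'):
#       print(r.get('region_id'), repr(r.get('text', '')))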
# Vision-Language Integration
class VisionLanguageProcessor:
"""Integrate OCR with LLM for intelligent document understanding"""
def __init__(self, ollama_manager):
self.ocr = OCRProcessor()
self.ollama_manager = ollama_manager
async def analyze_document(self, image_path: str) -> Dict[str, Any]:
"""Analyze document with OCR + LLM"""
# Extract text with OCR
ocr_result = self.ocr.extract_text(image_path, enhancement_type='scan')
if not ocr_result['text']:
return {'error': 'No text found in image'}
# Analyze with LLM
analysis_prompt = f"""Analyze this document text and provide a structured summary:
Text:
{ocr_result['text']}
Please provide:
1. Document type (letter, invoice, report, etc.)
2. Key information extracted
3. Important dates, numbers, or entities
4. Summary of main content
5. Any action items or requirements
Format your response as structured information."""
analysis = await self.ollama_manager.generate_response(
analysis_prompt,
system_prompt="You are an expert document analyst. Provide clear, structured analysis of documents."
)
return {
'ocr_result': ocr_result,
'analysis': analysis,
'confidence': ocr_result.get('confidence', 0),
'word_count': ocr_result.get('word_count', 0)
}
async def answer_document_questions(self, image_path: str, question: str) -> str:
"""Answer questions about document content"""
# Extract text
ocr_result = self.ocr.extract_text(image_path)
if not ocr_result['text']:
return "I couldn't extract any text from this image."
# Answer question based on document content
qa_prompt = f"""Based on the following document text, answer the user's question.
Document text:
{ocr_result['text']}
User question: {question}
Provide a clear, accurate answer based only on the information in the document. If the information is not available, say so clearly."""
answer = await self.ollama_manager.generate_response(
qa_prompt,
system_prompt="You are a helpful assistant that answers questions based on document content."
)
return answer
# Usage example
async def demo_ocr_processing():
from models.ollama_manager import OllamaManager
# Initialize processors
ocr = OCRProcessor()
ollama = OllamaManager()
vision_processor = VisionLanguageProcessor(ollama)
# Example: Process a document image
image_path = "sample_document.jpg" # Replace with actual image path
if os.path.exists(image_path):
# Basic OCR
ocr_result = ocr.extract_text(image_path, enhancement_type='scan')
print(f"Extracted text: {ocr_result['text'][:200]}...")
print(f"Confidence: {ocr_result['confidence']:.2f}%")
# Document analysis
analysis = await vision_processor.analyze_document(image_path)
print(f"Document analysis: {analysis['analysis']}")
# Question answering
question = "What is the main topic of this document?"
answer = await vision_processor.answer_document_questions(image_path, question)
print(f"Q: {question}")
print(f"A: {answer}")
else:
print("Sample image not found. Please provide a document image to test.")
if __name__ == "__main__":
asyncio.run(demo_ocr_processing())
Why this OCR architecture?
- Multiple enhancement strategies: Different preprocessing for different document types (combined with confidence scoring, these can also be tried in turn, as shown in the sketch after this list)
- Confidence scoring: Quality assessment of OCR results
- Structured extraction: Beyond plain text to document structure
- LLM integration: Intelligent document analysis and Q&A
- Regional processing: Extract text from specific areas
- Error handling: Graceful degradation on processing failures
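The first two points can be combined into a simple retry strategy. The sketch below assumes the OCRProcessor module above is saved as vision/ocr_processor.py and that a test image exists on disk; it runs every preprocessing strategy and keeps whichever result Tesseract scores highest:
# best_effort_ocr.py - try each enhancement strategy, keep the highest-confidence result
from vision.ocr_processor import OCRProcessor
def best_effort_ocr(image_path: str) -> dict:
    ocr = OCRProcessor()
    strategies = ('scan', 'photo', 'screenshot', 'default')
    # Run OCR once per preprocessing strategy, then rank by average confidence
    candidates = [
        ocr.extract_text(image_path, enhancement_type=strategy)
        for strategy in strategies
    ]
    return max(candidates, key=lambda result: result.get('confidence', 0))
if __name__ == "__main__":
    best = best_effort_ocr("sample_document.jpg")  # replace with a real image
    print(f"Best confidence: {best.get('confidence', 0):.2f}%")
    print(best.get('text', '')[:200])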