Part 6: OCR and Image Processing
Why OCR and Image Processing?
Visual document understanding enables:
- Document digitization: Convert paper documents to searchable text
- Screenshot analysis: Extract text from screen captures
- Handwriting recognition: Best-effort processing of handwritten notes (classic Tesseract is weak on handwriting, so expect limited accuracy)
- Multi-modal interaction: Understand visual context in conversations
OCR Implementation with Tesseract
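Before running the processor below, make sure both the native Tesseract binary and the Python bindings are installed. A typical setup is pip install pytesseract pillow opencv-python numpy plus a system package such as tesseract-ocr on Debian/Ubuntu (on Windows, a standalone Tesseract installer). The following sanity check is a minimal sketch, assuming a reasonably recent pytesseract:
# check_tesseract.py - confirm the Tesseract binary is reachable from Python
import pytesseract
print(pytesseract.get_tesseract_version())   # e.g. 5.x
print(pytesseract.get_languages(config=''))  # installed language packs, e.g. ['eng', 'osd']
If the binary is not on your PATH, point pytesseract at it explicitly via pytesseract.pytesseract.tesseract_cmd, as shown in the commented lines of the class below.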
# vision/ocr_processor.py
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
import cv2
import numpy as np
import asyncio  # used by the demo entry point at the bottom of this module
import logging
from typing import List, Dict, Any, Optional, Tuple
import tempfile
import os
from pathlib import Path
class OCRProcessor:
def __init__(self):
# Configure Tesseract (adjust path as needed)
# pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Linux
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' # Windows
# Supported image formats
self.supported_formats = {'.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'}
# OCR configuration for different document types
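        # Flag reference: --oem 3 lets Tesseract choose the best available engine
        # (the LSTM engine when present); --psm sets the page segmentation mode:
        # 6 = a single uniform block of text, 7 = a single text line, 8 = a single word.
        # Note that classic Tesseract handles handwriting poorly, so the
        # 'handwritten' profile below is best-effort only.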
self.ocr_configs = {
'default': '--oem 3 --psm 6',
'single_block': '--oem 3 --psm 6',
'single_line': '--oem 3 --psm 7',
'single_word': '--oem 3 --psm 8',
'digits_only': '--oem 3 --psm 6 -c tessedit_char_whitelist=0123456789',
'handwritten': '--oem 3 --psm 6',
}
def preprocess_image(self, image: Image.Image, enhancement_type: str = 'default') -> Image.Image:
"""Preprocess image for better OCR accuracy"""
# Convert to RGB if necessary
if image.mode != 'RGB':
image = image.convert('RGB')
# Apply different preprocessing based on type
if enhancement_type == 'scan':
# For scanned documents
image = self._enhance_scanned_document(image)
elif enhancement_type == 'photo':
# For photos of documents
image = self._enhance_photo_document(image)
elif enhancement_type == 'screenshot':
# For screenshots
image = self._enhance_screenshot(image)
else:
# Default enhancement
image = self._default_enhancement(image)
return image
def _enhance_scanned_document(self, image: Image.Image) -> Image.Image:
"""Enhance scanned documents"""
# Convert to OpenCV format
cv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
# Convert to grayscale
gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
# Apply Gaussian blur to reduce noise
blurred = cv2.GaussianBlur(gray, (5, 5), 0)
# Apply threshold to get binary image
_, thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        # Close small gaps in characters with a morphological close
        # (a 1x1 kernel would be a no-op, so use a small 2x2 kernel)
        kernel = np.ones((2, 2), np.uint8)
        cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
# Convert back to PIL
return Image.fromarray(cleaned)
def _enhance_photo_document(self, image: Image.Image) -> Image.Image:
"""Enhance photos of documents"""
# Convert to OpenCV format
cv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
# Convert to grayscale
gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
# Apply adaptive threshold
adaptive_thresh = cv2.adaptiveThreshold(
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
)
# Apply dilation and erosion to connect text components
kernel = np.ones((2, 2), np.uint8)
processed = cv2.morphologyEx(adaptive_thresh, cv2.MORPH_CLOSE, kernel)
return Image.fromarray(processed)
def _enhance_screenshot(self, image: Image.Image) -> Image.Image:
"""Enhance screenshots"""
# Screenshots are usually already high quality, minimal processing needed
enhancer = ImageEnhance.Contrast(image)
enhanced = enhancer.enhance(1.2)
enhancer = ImageEnhance.Sharpness(enhanced)
enhanced = enhancer.enhance(1.1)
return enhanced
def _default_enhancement(self, image: Image.Image) -> Image.Image:
"""Default image enhancement"""
# Enhance contrast
enhancer = ImageEnhance.Contrast(image)
enhanced = enhancer.enhance(1.2)
# Enhance sharpness
enhancer = ImageEnhance.Sharpness(enhanced)
enhanced = enhancer.enhance(1.1)
# Apply slight blur to reduce noise
enhanced = enhanced.filter(ImageFilter.MedianFilter(size=3))
return enhanced
def extract_text(
self,
image_path: str,
config_type: str = 'default',
language: str = 'eng',
enhancement_type: str = 'default'
) -> Dict[str, Any]:
"""Extract text from image"""
try:
# Load image
image = Image.open(image_path)
# Preprocess image
processed_image = self.preprocess_image(image, enhancement_type)
# Get OCR configuration
config = self.ocr_configs.get(config_type, self.ocr_configs['default'])
config += f' -l {language}'
# Extract text
text = pytesseract.image_to_string(processed_image, config=config)
# Get detailed information
data = pytesseract.image_to_data(processed_image, config=config, output_type=pytesseract.Output.DICT)
            # Calculate confidence scores; Tesseract may report them as floats
            # (e.g. "96.54"), so parse with float() rather than int()
            confidences = [float(conf) for conf in data['conf'] if float(conf) > 0]
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0
# Extract word-level information
words = []
for i in range(len(data['text'])):
                if float(data['conf'][i]) > 30:  # Only include words with decent confidence
                    words.append({
                        'text': data['text'][i],
                        'confidence': float(data['conf'][i]),
'bbox': {
'x': data['left'][i],
'y': data['top'][i],
'width': data['width'][i],
'height': data['height'][i]
}
})
return {
'text': text.strip(),
'confidence': avg_confidence,
'word_count': len([w for w in words if w['text'].strip()]),
'words': words,
'image_size': image.size,
'processing_config': config_type
}
except Exception as e:
logging.error(f"OCR extraction failed: {e}")
return {
'text': '',
'confidence': 0,
'error': str(e)
}
def extract_text_from_regions(
self,
image_path: str,
regions: List[Tuple[int, int, int, int]],
config_type: str = 'default'
) -> List[Dict[str, Any]]:
"""Extract text from specific regions of an image"""
results = []
try:
image = Image.open(image_path)
for i, (x, y, width, height) in enumerate(regions):
# Crop region
region = image.crop((x, y, x + width, y + height))
                # Save the crop to a temporary file so extract_text() can reopen it;
                # mkstemp with an immediately closed handle also works on Windows,
                # where a file held open by NamedTemporaryFile cannot be reopened
                fd, temp_path = tempfile.mkstemp(suffix='.png')
                os.close(fd)
                try:
                    region.save(temp_path)
                    # Extract text from region
                    result = self.extract_text(temp_path, config_type)
                    result['region_id'] = i
                    result['bbox'] = {'x': x, 'y': y, 'width': width, 'height': height}
                    results.append(result)
                finally:
                    # Clean up the temporary file
                    os.unlink(temp_path)
except Exception as e:
logging.error(f"Region OCR failed: {e}")
results.append({'error': str(e), 'region_id': -1})
return results
def detect_document_structure(self, image_path: str) -> Dict[str, Any]:
"""Detect document structure (headers, paragraphs, etc.)"""
try:
# Get detailed OCR data
image = Image.open(image_path)
processed_image = self.preprocess_image(image)
data = pytesseract.image_to_data(
processed_image,
config='--oem 3 --psm 6',
output_type=pytesseract.Output.DICT
)
# Group text by blocks and paragraphs
blocks = {}
paragraphs = {}
for i in range(len(data['text'])):
                if float(data['conf'][i]) > 30:
block_num = data['block_num'][i]
par_num = data['par_num'][i]
# Group by blocks
if block_num not in blocks:
blocks[block_num] = []
blocks[block_num].append({
'text': data['text'][i],
'bbox': {
'x': data['left'][i],
'y': data['top'][i],
'width': data['width'][i],
'height': data['height'][i]
}
})
# Group by paragraphs
par_key = f"{block_num}_{par_num}"
if par_key not in paragraphs:
paragraphs[par_key] = []
paragraphs[par_key].append({
'text': data['text'][i],
'bbox': {
'x': data['left'][i],
'y': data['top'][i],
'width': data['width'][i],
'height': data['height'][i]
}
})
# Reconstruct structured text
structured_text = []
for block_id in sorted(blocks.keys()):
block_text = ' '.join([item['text'] for item in blocks[block_id] if item['text'].strip()])
if block_text.strip():
structured_text.append({
'type': 'block',
'id': block_id,
'text': block_text.strip()
})
return {
'blocks': blocks,
'paragraphs': paragraphs,
'structured_text': structured_text,
'total_blocks': len(blocks),
'total_paragraphs': len(paragraphs)
}
except Exception as e:
logging.error(f"Document structure detection failed: {e}")
return {'error': str(e)}
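# Illustrative usage of region-based extraction (the file name and coordinates
# are hypothetical, e.g. a header strip and a totals box on a scanned form):
#
#   ocr = OCRProcessor()
#   regions = [(40, 30, 600, 90), (420, 900, 220, 70)]  # (x, y, width, height)
#   for r in ocr.extract_text_from_regions("form.png", regions, config_type='single_block'):
#       print(r.get('region_id'), repr(r.get('text', '')))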
# Vision-Language Integration
class VisionLanguageProcessor:
"""Integrate OCR with LLM for intelligent document understanding"""
def __init__(self, ollama_manager):
self.ocr = OCRProcessor()
self.ollama_manager = ollama_manager
async def analyze_document(self, image_path: str) -> Dict[str, Any]:
"""Analyze document with OCR + LLM"""
# Extract text with OCR
ocr_result = self.ocr.extract_text(image_path, enhancement_type='scan')
if not ocr_result['text']:
return {'error': 'No text found in image'}
# Analyze with LLM
analysis_prompt = f"""Analyze this document text and provide a structured summary:
Text:
{ocr_result['text']}
Please provide:
1. Document type (letter, invoice, report, etc.)
2. Key information extracted
3. Important dates, numbers, or entities
4. Summary of main content
5. Any action items or requirements
Format your response as structured information."""
analysis = await self.ollama_manager.generate_response(
analysis_prompt,
system_prompt="You are an expert document analyst. Provide clear, structured analysis of documents."
)
return {
'ocr_result': ocr_result,
'analysis': analysis,
'confidence': ocr_result.get('confidence', 0),
'word_count': ocr_result.get('word_count', 0)
}
async def answer_document_questions(self, image_path: str, question: str) -> str:
"""Answer questions about document content"""
# Extract text
ocr_result = self.ocr.extract_text(image_path)
if not ocr_result['text']:
return "I couldn't extract any text from this image."
# Answer question based on document content
qa_prompt = f"""Based on the following document text, answer the user's question.
Document text:
{ocr_result['text']}
User question: {question}
Provide a clear, accurate answer based only on the information in the document. If the information is not available, say so clearly."""
answer = await self.ollama_manager.generate_response(
qa_prompt,
system_prompt="You are a helpful assistant that answers questions based on document content."
)
return answer
# Usage example
async def demo_ocr_processing():
from models.ollama_manager import OllamaManager
# Initialize processors
ocr = OCRProcessor()
ollama = OllamaManager()
vision_processor = VisionLanguageProcessor(ollama)
# Example: Process a document image
image_path = "sample_document.jpg" # Replace with actual image path
if os.path.exists(image_path):
# Basic OCR
ocr_result = ocr.extract_text(image_path, enhancement_type='scan')
print(f"Extracted text: {ocr_result['text'][:200]}...")
print(f"Confidence: {ocr_result['confidence']:.2f}%")
# Document analysis
analysis = await vision_processor.analyze_document(image_path)
print(f"Document analysis: {analysis['analysis']}")
# Question answering
question = "What is the main topic of this document?"
answer = await vision_processor.answer_document_questions(image_path, question)
print(f"Q: {question}")
print(f"A: {answer}")
else:
print("Sample image not found. Please provide a document image to test.")
if __name__ == "__main__":
asyncio.run(demo_ocr_processing())
Why this OCR architecture?
- Multiple enhancement strategies: Different preprocessing for different document types (combined with confidence scoring, these can also be tried in turn, as shown in the sketch after this list)
- Confidence scoring: Quality assessment of OCR results
- Structured extraction: Beyond plain text to document structure
- LLM integration: Intelligent document analysis and Q&A
- Regional processing: Extract text from specific areas
- Error handling: Graceful degradation on processing failures
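The first two points can be combined into a simple retry strategy. The sketch below assumes the OCRProcessor module above is saved as vision/ocr_processor.py and that a test image exists on disk; it runs every preprocessing strategy and keeps whichever result Tesseract scores highest:
# best_effort_ocr.py - try each enhancement strategy, keep the highest-confidence result
from vision.ocr_processor import OCRProcessor
def best_effort_ocr(image_path: str) -> dict:
    ocr = OCRProcessor()
    strategies = ('scan', 'photo', 'screenshot', 'default')
    # Run OCR once per preprocessing strategy, then rank by average confidence
    candidates = [
        ocr.extract_text(image_path, enhancement_type=strategy)
        for strategy in strategies
    ]
    return max(candidates, key=lambda result: result.get('confidence', 0))
if __name__ == "__main__":
    best = best_effort_ocr("sample_document.jpg")  # replace with a real image
    print(f"Best confidence: {best.get('confidence', 0):.2f}%")
    print(best.get('text', '')[:200])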