Part 2: Local LLM with Ollama Integration
Why Ollama?
Ollama provides several advantages:
- Privacy: Your data never leaves your machine
- Speed: No network round-trips; inference latency depends only on your local hardware
- Cost: No per-token API fees, however heavily you use it
- Customization: Adapt models to your needs with Modelfiles (system prompts, parameters, LoRA adapters)
Setting Up Ollama Models
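The manager below pulls its settings from a small `config` module. That module isn't shown in this part, so here is a minimal sketch of what it's assumed to contain: the attribute names (`OLLAMA_BASE_URL`, `PRIMARY_MODEL`, `EMBEDDING_MODEL`) are taken from how they're used in the code, while the default values are placeholders you should adjust to your own setup.

```python
# config.py -- minimal sketch, adjust values to your setup
from dataclasses import dataclass


@dataclass(frozen=True)
class Config:
    # Default Ollama endpoint; change if Ollama runs elsewhere
    OLLAMA_BASE_URL: str = "http://localhost:11434"
    # Placeholder model names -- any chat/embedding models you have pulled will do
    PRIMARY_MODEL: str = "llama3.1:8b"
    EMBEDDING_MODEL: str = "nomic-embed-text"


config = Config()
```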
```python
# models/ollama_manager.py
import asyncio
import logging
from typing import AsyncGenerator, Optional

import ollama

from config import config


class OllamaManager:
    def __init__(self):
        self.client = ollama.AsyncClient(host=config.OLLAMA_BASE_URL)
        # Models already confirmed available; avoids repeated list()/pull() calls
        self.models_cache: set[str] = set()

    async def ensure_model_available(self, model_name: str) -> bool:
        """Ensure a model is pulled and available"""
        if model_name in self.models_cache:
            return True
        try:
            # Check if the model exists locally. Depending on the ollama-python
            # version, the name is exposed as 'name' (older) or 'model' (newer).
            models = await self.client.list()
            local_names = {m.get('name') or m.get('model') for m in models['models']}
            if model_name not in local_names:
                logging.info(f"Pulling model {model_name}...")
                await self.client.pull(model_name)
                logging.info(f"Model {model_name} pulled successfully")
            self.models_cache.add(model_name)
            return True
        except Exception as e:
            logging.error(f"Failed to ensure model {model_name}: {e}")
            return False

    def _build_messages(self, prompt: str, system_prompt: Optional[str]) -> list[dict]:
        """Assemble the chat message list shared by both generation paths"""
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})
        return messages

    async def generate_response(
        self,
        prompt: str,
        model: Optional[str] = None,
        system_prompt: Optional[str] = None
    ) -> str:
        """Generate a complete (non-streaming) response with the specified model"""
        model = model or config.PRIMARY_MODEL
        await self.ensure_model_available(model)
        messages = self._build_messages(prompt, system_prompt)
        try:
            response = await self.client.chat(model=model, messages=messages)
            return response['message']['content']
        except Exception as e:
            logging.error(f"Generation failed: {e}")
            return "I apologize, but I encountered an error generating a response."

    async def stream_response(
        self,
        prompt: str,
        model: Optional[str] = None,
        system_prompt: Optional[str] = None
    ) -> AsyncGenerator[str, None]:
        """Stream a response chunk by chunk with the specified model.

        Kept separate from generate_response because an async generator
        cannot also `return` a value; mixing the two is a syntax error.
        """
        model = model or config.PRIMARY_MODEL
        await self.ensure_model_available(model)
        messages = self._build_messages(prompt, system_prompt)
        try:
            async for chunk in await self.client.chat(
                model=model,
                messages=messages,
                stream=True
            ):
                if chunk['message']['content']:
                    yield chunk['message']['content']
        except Exception as e:
            logging.error(f"Streaming generation failed: {e}")
            yield "I apologize, but I encountered an error generating a response."

    async def generate_embeddings(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for texts"""
        await self.ensure_model_available(config.EMBEDDING_MODEL)
        embeddings = []
        for text in texts:
            try:
                response = await self.client.embeddings(
                    model=config.EMBEDDING_MODEL,
                    prompt=text
                )
                embeddings.append(response['embedding'])
            except Exception as e:
                logging.error(f"Embedding generation failed for text: {e}")
                # Fallback zero vector; 768 dims matches nomic-embed-text,
                # adjust if your embedding model differs
                embeddings.append([0.0] * 768)
        return embeddings


# Usage example
async def main():
    manager = OllamaManager()

    # Ensure our models are available
    await manager.ensure_model_available(config.PRIMARY_MODEL)
    await manager.ensure_model_available(config.EMBEDDING_MODEL)

    # Generate a response
    response = await manager.generate_response(
        "Explain why local AI models are important for privacy",
        system_prompt="You are a helpful AI assistant focused on privacy and security."
    )
    print(response)


if __name__ == "__main__":
    asyncio.run(main())
```
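To sanity-check the embedding path, here is a short illustrative sketch that embeds two sentences and compares them with cosine similarity. The helper function and example texts are not part of the manager, and the import assumes the `models/` package layout from the file header above.

```python
# embed_demo.py -- illustrative check of OllamaManager.generate_embeddings
import asyncio
import math

from models.ollama_manager import OllamaManager


def cosine_similarity(a: list[float], b: list[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
    return dot / norm if norm else 0.0


async def demo():
    manager = OllamaManager()
    texts = [
        "Local models keep sensitive data on your own hardware.",
        "Running inference locally avoids sending data to third parties.",
    ]
    vec_a, vec_b = await manager.generate_embeddings(texts)
    print(f"Similarity: {cosine_similarity(vec_a, vec_b):.3f}")


if __name__ == "__main__":
    asyncio.run(demo())
```

Semantically related sentences like these should score noticeably higher than unrelated ones, which is a quick way to confirm the embedding model is wired up correctly.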
Why this architecture?
- Async operations: Prevents blocking during model loading/inference
- Model caching: Skips repeated availability checks and pulls for models already confirmed present (models_cache)
- Error handling: Graceful degradation when models fail
- Flexible interface: Separate non-streaming (generate_response) and streaming (stream_response) entry points; a streaming usage sketch follows this list
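Since main() above already covers the non-streaming path, here is a minimal sketch of the streaming side, printing tokens as they arrive. The prompt text is just an example, and the import again assumes the `models/` package layout.

```python
# stream_demo.py -- illustrative use of OllamaManager.stream_response
import asyncio

from models.ollama_manager import OllamaManager


async def demo():
    manager = OllamaManager()
    # Chunks are printed as soon as Ollama produces them
    async for token in manager.stream_response(
        "Summarize the benefits of running LLMs locally in three bullet points."
    ):
        print(token, end="", flush=True)
    print()


if __name__ == "__main__":
    asyncio.run(demo())
```

Streaming matters mostly for user-facing chat: the first tokens appear almost immediately instead of after the full generation, which makes local models feel far more responsive.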