Part 2: Local LLM with Ollama Integration
Why Ollama?
Ollama provides several advantages:
- Privacy: Your data never leaves your machine
- Speed: No network round-trips; inference latency depends only on your local hardware
- Cost: No per-token API fees, however heavily you use it
- Customization: Adapt models to your needs with Modelfiles (system prompts, parameters, LoRA adapters)
Setting Up Ollama Models
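The manager below pulls its settings from a small `config` module. That module isn't shown in this part, so here is a minimal sketch of what it's assumed to contain: the attribute names (`OLLAMA_BASE_URL`, `PRIMARY_MODEL`, `EMBEDDING_MODEL`) are taken from how they're used in the code, while the default values are placeholders you should adjust to your own setup.

```python
# config.py -- minimal sketch, adjust values to your setup
from dataclasses import dataclass


@dataclass(frozen=True)
class Config:
    # Default Ollama endpoint; change if Ollama runs elsewhere
    OLLAMA_BASE_URL: str = "http://localhost:11434"
    # Placeholder model names -- any chat/embedding models you have pulled will do
    PRIMARY_MODEL: str = "llama3.1:8b"
    EMBEDDING_MODEL: str = "nomic-embed-text"


config = Config()
```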
```python
# models/ollama_manager.py
import asyncio
import logging
from typing import AsyncGenerator, Optional

import ollama

from config import config


class OllamaManager:
    def __init__(self):
        self.client = ollama.AsyncClient(host=config.OLLAMA_BASE_URL)
        # Models already confirmed available; avoids repeated list()/pull() calls
        self.models_cache: set[str] = set()

    async def ensure_model_available(self, model_name: str) -> bool:
        """Ensure a model is pulled and available"""
        if model_name in self.models_cache:
            return True
        try:
            # Check if the model exists locally. Depending on the ollama-python
            # version, the name is exposed as 'name' (older) or 'model' (newer).
            models = await self.client.list()
            local_names = {m.get('name') or m.get('model') for m in models['models']}
            if model_name not in local_names:
                logging.info(f"Pulling model {model_name}...")
                await self.client.pull(model_name)
                logging.info(f"Model {model_name} pulled successfully")
            self.models_cache.add(model_name)
            return True
        except Exception as e:
            logging.error(f"Failed to ensure model {model_name}: {e}")
            return False

    def _build_messages(self, prompt: str, system_prompt: Optional[str]) -> list[dict]:
        """Assemble the chat message list shared by both generation paths"""
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})
        return messages

    async def generate_response(
        self,
        prompt: str,
        model: Optional[str] = None,
        system_prompt: Optional[str] = None
    ) -> str:
        """Generate a complete (non-streaming) response with the specified model"""
        model = model or config.PRIMARY_MODEL
        await self.ensure_model_available(model)
        messages = self._build_messages(prompt, system_prompt)
        try:
            response = await self.client.chat(model=model, messages=messages)
            return response['message']['content']
        except Exception as e:
            logging.error(f"Generation failed: {e}")
            return "I apologize, but I encountered an error generating a response."

    async def stream_response(
        self,
        prompt: str,
        model: Optional[str] = None,
        system_prompt: Optional[str] = None
    ) -> AsyncGenerator[str, None]:
        """Stream a response chunk by chunk with the specified model.

        Kept separate from generate_response because an async generator
        cannot also `return` a value; mixing the two is a syntax error.
        """
        model = model or config.PRIMARY_MODEL
        await self.ensure_model_available(model)
        messages = self._build_messages(prompt, system_prompt)
        try:
            async for chunk in await self.client.chat(
                model=model,
                messages=messages,
                stream=True
            ):
                if chunk['message']['content']:
                    yield chunk['message']['content']
        except Exception as e:
            logging.error(f"Streaming generation failed: {e}")
            yield "I apologize, but I encountered an error generating a response."

    async def generate_embeddings(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for texts"""
        await self.ensure_model_available(config.EMBEDDING_MODEL)
        embeddings = []
        for text in texts:
            try:
                response = await self.client.embeddings(
                    model=config.EMBEDDING_MODEL,
                    prompt=text
                )
                embeddings.append(response['embedding'])
            except Exception as e:
                logging.error(f"Embedding generation failed for text: {e}")
                # Fallback zero vector; 768 dims matches nomic-embed-text,
                # adjust if your embedding model differs
                embeddings.append([0.0] * 768)
        return embeddings


# Usage example
async def main():
    manager = OllamaManager()

    # Ensure our models are available
    await manager.ensure_model_available(config.PRIMARY_MODEL)
    await manager.ensure_model_available(config.EMBEDDING_MODEL)

    # Generate a response
    response = await manager.generate_response(
        "Explain why local AI models are important for privacy",
        system_prompt="You are a helpful AI assistant focused on privacy and security."
    )
    print(response)


if __name__ == "__main__":
    asyncio.run(main())
```
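To sanity-check the embedding path, here is a short illustrative sketch that embeds two sentences and compares them with cosine similarity. The helper function and example texts are not part of the manager, and the import assumes the `models/` package layout from the file header above.

```python
# embed_demo.py -- illustrative check of OllamaManager.generate_embeddings
import asyncio
import math

from models.ollama_manager import OllamaManager


def cosine_similarity(a: list[float], b: list[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
    return dot / norm if norm else 0.0


async def demo():
    manager = OllamaManager()
    texts = [
        "Local models keep sensitive data on your own hardware.",
        "Running inference locally avoids sending data to third parties.",
    ]
    vec_a, vec_b = await manager.generate_embeddings(texts)
    print(f"Similarity: {cosine_similarity(vec_a, vec_b):.3f}")


if __name__ == "__main__":
    asyncio.run(demo())
```

Semantically related sentences like these should score noticeably higher than unrelated ones, which is a quick way to confirm the embedding model is wired up correctly.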
Why this architecture?
- Async operations: Prevents blocking during model loading/inference
- Model caching: Skips repeated availability checks and pulls for models already confirmed present (models_cache)
- Error handling: Graceful degradation when models fail
- Flexible interface: Separate non-streaming (generate_response) and streaming (stream_response) entry points; a streaming usage sketch follows this list
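Since main() above already covers the non-streaming path, here is a minimal sketch of the streaming side, printing tokens as they arrive. The prompt text is just an example, and the import again assumes the `models/` package layout.

```python
# stream_demo.py -- illustrative use of OllamaManager.stream_response
import asyncio

from models.ollama_manager import OllamaManager


async def demo():
    manager = OllamaManager()
    # Chunks are printed as soon as Ollama produces them
    async for token in manager.stream_response(
        "Summarize the benefits of running LLMs locally in three bullet points."
    ):
        print(token, end="", flush=True)
    print()


if __name__ == "__main__":
    asyncio.run(demo())
```

Streaming matters mostly for user-facing chat: the first tokens appear almost immediately instead of after the full generation, which makes local models feel far more responsive.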