mirror of https://github.com/0xplaygrounds/rig
use anyhow::{Context, Result};
use rig::{
    embeddings::EmbeddingsBuilder, loaders::PdfFileLoader, providers::openai,
    vector_store::in_memory_store::InMemoryVectorStore, Embed,
};
use serde::{Deserialize, Serialize};
use std::path::PathBuf;

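/// A chunk of document text to embed. The `Embed` derive generates the
/// embedding plumbing, and the `#[embed]` attribute marks `content` as the
/// field whose text is sent to the embedding model; `id` travels along as
/// plain metadata.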
#[derive(Embed, Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
struct Document {
    id: String,
    #[embed]
    content: String,
}

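/// Loads PDFs with rig's `PdfFileLoader` and splits their text into
/// whitespace-delimited chunks of at most `CHUNK_SIZE` characters. Chunking
/// keeps each embedding request comfortably within the model's input limit
/// and gives retrieval finer-grained passages to match against.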
fn load_pdf(path: PathBuf) -> Result<Vec<String>> {
    const CHUNK_SIZE: usize = 2000;

    let content_chunks = PdfFileLoader::with_glob(path.to_str().context("Invalid path")?)?
        .read()
        .into_iter()
        .filter_map(|result| {
            // Log and skip files that fail to load rather than aborting.
            result
                .map_err(|e| {
                    eprintln!("Error reading PDF content: {}", e);
                    e
                })
                .ok()
        })
        .flat_map(|content| {
            let mut chunks = Vec::new();
            let mut current = String::new();

            for word in content.split_whitespace() {
                // Flush the current chunk before appending a word that would
                // push it past CHUNK_SIZE.
                if current.len() + word.len() + 1 > CHUNK_SIZE && !current.is_empty() {
                    chunks.push(std::mem::take(&mut current).trim().to_string());
                }
                current.push_str(word);
                current.push(' ');
            }

            // Keep whatever remains as the final chunk.
            if !current.is_empty() {
                chunks.push(current.trim().to_string());
            }

            chunks
        })
        .collect::<Vec<_>>();

    if content_chunks.is_empty() {
        anyhow::bail!("No content found in PDF file: {}", path.display());
    }

    Ok(content_chunks)
}

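/// Ties the pipeline together: chunk the PDF, embed the chunks, index them in
/// an in-memory vector store, and serve a chat agent that retrieves matching
/// chunks as context for each question.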
#[tokio::main]
async fn main() -> Result<()> {
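    // Ollama exposes an OpenAI-compatible API under /v1, so rig's OpenAI
    // provider can talk to it directly; the "ollama" API key is a placeholder
    // that Ollama ignores. This assumes a local Ollama server is already
    // running on its default port, 11434.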
    // Initialize Ollama client
    let client = openai::Client::from_url("ollama", "http://localhost:11434/v1");

    // Load PDFs using Rig's built-in PDF loader
    let documents_dir = std::env::current_dir()?.join("rig-core/examples/documents");

    let pdf_chunks =
        load_pdf(documents_dir.join("deepseek_r1.pdf")).context("Failed to load pdf documents")?;

    println!("Successfully loaded and chunked PDF documents");

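    // Assumes the `bge-m3` embedding model has already been pulled into
    // Ollama (`ollama pull bge-m3`); any embedding model Ollama serves could
    // be substituted here.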
    // Create embedding model
    let model = client.embedding_model("bge-m3");

    // Create embeddings builder
    let mut builder = EmbeddingsBuilder::new(model.clone());

    // Add chunks from pdf documents
    for (i, chunk) in pdf_chunks.into_iter().enumerate() {
        builder = builder.document(Document {
            id: format!("pdf_document_{}", i),
            content: chunk,
        })?;
    }

    // Build embeddings
    let embeddings = builder.build().await?;

    println!("Successfully generated embeddings");

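    // The in-memory store keeps every embedding in RAM and scores queries by
    // vector similarity; it is well suited to examples like this one, while a
    // production setup would typically swap in a persistent vector store.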
    // Create vector store and index
    let vector_store = InMemoryVectorStore::from_documents(embeddings);
    let index = vector_store.index(model);

    println!("Successfully created vector store and index");

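    // The chat model, like the embedding model, must be available in Ollama
    // (`ollama pull deepseek-r1`). `dynamic_context(1, index)` retrieves the
    // top matching chunk for each prompt and injects it into the agent's
    // context; raising the count supplies more supporting chunks per question.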
    // Create RAG agent
    let rag_agent = client
        .agent("deepseek-r1")
        .preamble("You are a helpful assistant that answers questions based on the provided document context. When answering questions, try to synthesize information from multiple chunks if they're related.")
        .dynamic_context(1, index)
        .build();

    println!("Starting CLI chatbot...");

    // Start interactive CLI
    rig::cli_chatbot::cli_chatbot(rag_agent).await?;

    Ok(())
}