In this hands-on guide, we'll build a complete RAG pipeline from scratch. You'll create a system that ingests PDF documents, indexes them in a vector database, and answers questions with citations. We'll use LangChain, ChromaDB, and Claude.
Prerequisites & Setup
# Install dependencies
pip install langchain langchain-anthropic langchain-chroma
pip install pypdf sentence-transformers chromadb
pip install langchain-community python-dotenv
# WebBaseLoader (Step 1) parses HTML with BeautifulSoup, and Step 7 imports ragas
pip install beautifulsoup4 ragas
# .env file
ANTHROPIC_API_KEY=your_key_here
Step 1: Document Loading
LangChain provides loaders for dozens of document types. We'll support PDFs and web pages:
from langchain_community.document_loaders import (
PyPDFLoader,
WebBaseLoader,
DirectoryLoader
)
from pathlib import Path
def load_documents(source: str) -> list:
    """Load documents from a file path, directory, or URL.

    Args:
        source: An ``http://`` / ``https://`` URL, a directory that is
            searched recursively for PDF files, or the path of one PDF file.

    Returns:
        The list of documents produced by the selected loader.

    Raises:
        ValueError: If ``source`` is none of the supported kinds.
    """
    lowered = source.lower()
    if lowered.startswith(("http://", "https://")):
        # Match the full scheme so local paths such as "http_archive/" or
        # "httpd_notes.pdf" are not mistaken for URLs.
        loader = WebBaseLoader(source)
    elif Path(source).is_dir():
        loader = DirectoryLoader(source, glob="**/*.pdf",
                                 loader_cls=PyPDFLoader)
    elif lowered.endswith(".pdf"):
        # Case-insensitive so "REPORT.PDF" is accepted as well.
        loader = PyPDFLoader(source)
    else:
        raise ValueError(f"Unsupported source: {source}")
    documents = loader.load()
    print(f"Loaded {len(documents)} document(s)")
    return documents
# Example: load every PDF found under a local directory
docs = load_documents("./company_docs/")
# or load a single web page instead. Run as written, this second call
# rebinds `docs`, so only the web page result is kept.
docs = load_documents("https://docs.anthropic.com/claude")
Step 2: Chunking Strategy
Chunking is critical and often underestimated. Chunk size and overlap significantly affect retrieval quality.
from langchain.text_splitter import RecursiveCharacterTextSplitter
# RecursiveCharacterTextSplitter tries to split on natural boundaries
# (paragraphs → sentences → words) before splitting mid-word
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # ~250 tokens (rough estimate: 4 chars/token)
    chunk_overlap=200,  # 20% overlap helps preserve context at boundaries
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len,
)
chunks = splitter.split_documents(docs)
print(f"Split into {len(chunks)} chunks")
# Inspect a chunk. A small corpus can yield fewer than six chunks, so fall
# back to the last available one instead of raising IndexError.
if chunks:
    sample_chunk = chunks[min(5, len(chunks) - 1)]
    print(sample_chunk.page_content[:200])
    print(sample_chunk.metadata)  # Contains source, page number, etc.
Chunking tips: 512–1024 characters works well for most docs. Use semantic chunking (split at paragraph/section boundaries) when possible. Add 10–20% overlap to avoid splitting related sentences. For code, chunk at function boundaries instead.
Step 3: Embedding & Indexing
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
import os
# Local embedding model: free to run, fast, and good quality
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",  # 33M params, 384 dims
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)
# Alternative: OpenAI embeddings (better quality, costs money)
# from langchain_openai import OpenAIEmbeddings
# embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
PERSIST_DIR = "./chroma_db"
# Build the index on the first run; on later runs reopen what is on disk
if not os.path.exists(PERSIST_DIR):
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=PERSIST_DIR,
    )
    print(f"Created vectorstore with {len(chunks)} chunks")
else:
    vectorstore = Chroma(
        persist_directory=PERSIST_DIR,
        embedding_function=embeddings,
    )
    print(f"Loaded existing vectorstore with {vectorstore._collection.count()} chunks")
Step 4: Building the Retriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_anthropic import ChatAnthropic
# Basic retriever — returns the top-4 most similar chunks
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)
# Better: MMR (Maximum Marginal Relevance) retriever — maximizes relevance
# AND diversity, which avoids returning 4 nearly-identical chunks
mmr_search_kwargs = {
    "k": 4,
    "fetch_k": 20,  # Fetch 20 candidates, re-rank down to the top 4
    "lambda_mult": 0.5,  # 0 = max diversity, 1 = max relevance
}
mmr_retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs=mmr_search_kwargs,
)
# Smoke-test the basic retriever with a sample query
test_query = "What is the refund policy?"
results = retriever.invoke(test_query)
for rank, hit in enumerate(results, start=1):
    print(f"Result {rank}: {hit.page_content[:100]}...")
    print(f" Source: {hit.metadata.get('source', 'unknown')}\n")
Step 5: Building the RAG Chain
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# Initialize the LLM
# NOTE(review): no snippet shown here calls load_dotenv(), so the
# ANTHROPIC_API_KEY from the .env file is presumably expected to be exported
# in the environment already. Confirm, or load the .env file before this line.
llm = ChatAnthropic(
model="claude-opus-4-6",
max_tokens=1024,
temperature=0
)
# RAG prompt template
# {context} is filled with the formatted retrieved chunks and {question} with
# the user's query. The instructions restrict the model to the supplied
# context and ask it to cite sources.
RAG_PROMPT = ChatPromptTemplate.from_template("""
You are a helpful assistant. Answer the question based ONLY on the provided context.
If the answer is not in the context, say "I don't have information about that in the provided documents."
Always cite the source document when referencing specific information.
Context:
{context}
Question: {question}
Answer:""")
def format_docs(docs):
    """Build one context string from retrieved documents.

    Each document becomes a section made of a ``[Source: ...]`` label line
    followed by its page content. The label is the ``source`` metadata entry,
    or ``Document N`` (1-based position) when that entry is missing.
    Sections are separated by a ``---`` rule surrounded by blank lines.
    """
    def _label(position, doc):
        return doc.metadata.get("source", f"Document {position}")

    sections = (
        f"[Source: {_label(position, doc)}]\n{doc.page_content}"
        for position, doc in enumerate(docs, start=1)
    )
    return "\n\n---\n\n".join(sections)
# Compose the pipeline with LangChain Expression Language (LCEL).
# The incoming question goes to the MMR retriever, whose hits are formatted
# into the context, and is also passed through unchanged as the question.
prompt_inputs = {
    "context": mmr_retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = prompt_inputs | RAG_PROMPT | llm | StrOutputParser()
# Ask a question end to end
response = rag_chain.invoke("What is the company's remote work policy?")
print(response)
Step 6: Adding Streaming & Memory
from langchain_core.messages import HumanMessage, AIMessage
class ConversationalRAG:
    """Retrieval-augmented chatbot that remembers earlier turns.

    A follow-up question is first rewritten into a self-contained question
    using the recent chat history. The rewritten question is then used both
    to retrieve context and to generate the answer.
    """

    def __init__(self, retriever, llm):
        """Store the collaborators and start with an empty history.

        Args:
            retriever: Object whose ``invoke(query)`` returns the documents
                used as context.
            llm: Chat model whose ``invoke(...)`` result exposes ``.content``.
        """
        self.retriever = retriever
        self.llm = llm
        self.chat_history = []

    def _standalone_question(self, question: str) -> str:
        """Rewrite ``question`` so it can be understood without the history."""
        if not self.chat_history:
            # First turn: there is nothing to resolve, so skip the LLM call.
            return question
        history_str = "\n".join(
            f"Human: {msg.content}" if isinstance(msg, HumanMessage)
            else f"Assistant: {msg.content}"
            for msg in self.chat_history[-6:]  # Last 3 turns
        )
        contextualize_prompt = (
            "Given this conversation history:\n"
            f"{history_str}\n"
            "Rephrase the following question to be self-contained (if needed):\n"
            f"Question: {question}\n"
            "Rephrased:"
        )
        return self.llm.invoke(contextualize_prompt).content

    def chat(self, question: str) -> str:
        """Answer ``question`` from retrieved context and record the turn.

        Args:
            question: The user's message; may be a follow-up that refers to
                earlier turns.

        Returns:
            The model's answer text.
        """
        standalone_q = self._standalone_question(question)
        docs = self.retriever.invoke(standalone_q)
        context = format_docs(docs)
        # RAG_PROMPT has no slot for chat history, so the answering call must
        # receive the self-contained question; a raw follow-up such as
        # "How long does it take?" would have no referent. format_messages()
        # keeps the prompt as chat messages instead of flattening it to text.
        response = self.llm.invoke(
            RAG_PROMPT.format_messages(context=context, question=standalone_q)
        ).content
        # The history stores what the user actually typed.
        self.chat_history.extend([
            HumanMessage(content=question),
            AIMessage(content=response),
        ])
        return response
# Use it
# The bot is built on the MMR retriever and the LLM created in earlier steps.
bot = ConversationalRAG(mmr_retriever, llm)
print(bot.chat("What is the refund policy?"))
# The second call sees the history recorded by the first one.
print(bot.chat("How long does it take?")) # Follow-up question
Step 7: Evaluating RAG Quality
RAG systems need evaluation. Use RAGAS — a framework for RAG-specific metrics:
from ragas import evaluate
from ragas.metrics import (
faithfulness, # Is the answer grounded in the retrieved docs?
answer_relevancy, # Is the answer relevant to the question?
context_precision, # Are retrieved docs actually relevant?
context_recall, # Were all relevant docs retrieved?
)
# Prepare evaluation dataset
# Define the questions once so every column below stays aligned with them.
questions = ["What is the refund policy?", "How to cancel subscription?"]
eval_data = {
    "question": questions,
    # Use the stateless chain: a chatbot with memory would let the history of
    # one question influence the answer to the next.
    "answer": [rag_chain.invoke(q) for q in questions],
    # Collect contexts from the same retriever the chain answers from, so the
    # context metrics describe what the model actually saw.
    "contexts": [
        [doc.page_content for doc in mmr_retriever.invoke(q)]
        for q in questions
    ],
    "ground_truth": ["30-day money back guarantee", "Cancel in account settings"],
}
# NOTE(review): ragas' evaluate() may require a Hugging Face
# `datasets.Dataset` (e.g. Dataset.from_dict(eval_data)) rather than a plain
# dict. Confirm against the installed ragas version.
results = evaluate(
    dataset=eval_data,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
)
print(results)
Common Issues & Fixes
- Poor retrieval quality → Try MMR, tune chunk size, add metadata filters, use better embedding model
- Answers not grounded in docs → Strengthen system prompt, lower temperature, use faithfulness evaluation
- Missing context at chunk boundaries → Increase overlap (200–300 chars), use semantic chunking
- Slow indexing → Batch embedding calls, use GPU, persist the vectorstore
- Stale knowledge → Implement incremental indexing — only re-embed changed documents
Production tip: Start simple (basic similarity search, 1000-char chunks) and evaluate before optimizing. Most RAG improvements come from better chunking and retrieval quality, not a fancier architecture.
Key Takeaways
- A complete RAG pipeline: load → chunk → embed → store → retrieve → generate
- MMR retrieval beats plain similarity for diverse, relevant results
- Format retrieved docs clearly with source labels in the prompt
- Add conversation history for follow-up question handling
- Evaluate with RAGAS metrics: faithfulness, relevancy, precision, recall
- Persist your vector store — don't re-index on every restart