# Docling How-To Guides

Step-by-step solutions for specific tasks: OCR configuration, table extraction, RAG integration, and performance optimization.
## How to Configure OCR Engines

Docling supports multiple OCR backends. Choose one based on your needs:
### EasyOCR (Multi-language)

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = EasyOcrOptions(
    lang=["en", "fr", "de"],  # Languages to detect
    use_gpu=True,
    confidence_threshold=0.5,
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
```

Install: `pip install "docling[easyocr]"`
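Once configured, conversion works the same as with the default pipeline; a minimal usage sketch (the input file name is illustrative):

```python
# Convert a document with the EasyOCR-backed pipeline configured above
result = converter.convert("multilingual_scan.pdf")  # hypothetical input file
print(result.document.export_to_markdown())
```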
### Tesseract (System OCR)

```python
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractOcrOptions

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = TesseractOcrOptions()

# Requires a system Tesseract installation:
#   macOS:  brew install tesseract
#   Ubuntu: apt-get install tesseract-ocr
```

Install: `pip install "docling[tesserocr]"`
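If the `tesserocr` bindings are difficult to build on your platform, docling also ships a CLI-based variant that shells out to the `tesseract` binary instead; a minimal sketch:

```python
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions

# Uses the tesseract executable on PATH instead of the tesserocr bindings
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = TesseractCliOcrOptions()
```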
### RapidOCR (Default, ONNX-based)

```python
import os

from huggingface_hub import snapshot_download

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Download custom models from Hugging Face
download_path = snapshot_download(repo_id="SWHL/RapidOCR")

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = RapidOcrOptions(
    det_model_path=os.path.join(download_path, "PP-OCRv4", "en_PP-OCRv3_det_infer.onnx"),
    rec_model_path=os.path.join(download_path, "PP-OCRv4", "ch_PP-OCRv4_rec_server_infer.onnx"),
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
```
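On macOS, docling additionally offers a native engine backed by Apple's Vision framework; a minimal sketch, assuming the `ocrmac` package is available (macOS only):

```python
from docling.datamodel.pipeline_options import PdfPipelineOptions, OcrMacOptions

# Uses Apple's Vision framework via the ocrmac package (macOS only)
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = OcrMacOptions()
```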
## How to Process Scanned PDFs

Force full-page OCR for scanned documents where text extraction fails:
```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

# Force OCR on every page (for scanned documents)
pipeline_options.ocr_options = EasyOcrOptions(force_full_page_ocr=True)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)

result = converter.convert("scanned_document.pdf")
print(result.document.export_to_markdown())
```
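When running many scanned files in bulk, it is worth checking the conversion status before exporting; a small sketch using docling's `ConversionStatus`:

```python
from docling.datamodel.base_models import ConversionStatus

# Avoid raising on bad inputs; inspect the status instead
result = converter.convert("scanned_document.pdf", raises_on_error=False)
if result.status == ConversionStatus.SUCCESS:
    print(result.document.export_to_markdown())
else:
    print(f"Conversion failed with status: {result.status}")
```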
## How to Extract Tables to CSV/Excel

```python
import pandas as pd

from docling.document_converter import DocumentConverter

converter = DocumentConverter()
result = converter.convert("report.pdf")

# Extract all tables
for i, table in enumerate(result.document.tables):
    # To DataFrame
    df = table.export_to_dataframe()

    # Save as CSV
    df.to_csv(f"table_{i}.csv", index=False)

    # Save as Excel (requires openpyxl)
    df.to_excel(f"table_{i}.xlsx", index=False)

    # Or get as Markdown
    print(table.export_to_markdown())
```
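To collect every table into a single workbook rather than one file each, plain pandas is enough; a sketch (the sheet naming is illustrative):

```python
import pandas as pd

# Write all extracted tables into one Excel workbook, one sheet per table
with pd.ExcelWriter("tables.xlsx") as writer:
    for i, table in enumerate(result.document.tables):
        df = table.export_to_dataframe()
        df.to_excel(writer, sheet_name=f"table_{i}", index=False)
```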
## How to Extract Invoice Data

Convert the invoice to Markdown, then use an LLM to extract structured fields:
```python
import json

from openai import OpenAI

from docling.document_converter import DocumentConverter

# Step 1: Convert the PDF to Markdown
converter = DocumentConverter()
result = converter.convert("invoice.pdf")
markdown = result.document.export_to_markdown()

# Step 2: Use an LLM to extract structured data
client = OpenAI()
response = client.chat.completions.create(
    model="gpt-5",  # or your preferred model
    messages=[{
        "role": "user",
        # Double braces render as literal braces inside the f-string
        "content": f"""Extract invoice data as JSON:
{{"invoice_number": str, "date": str, "vendor": str, "total": float,
 "line_items": [{{"description": str, "qty": int, "price": float}}]}}

Document:
{markdown}""",
    }],
    response_format={"type": "json_object"},
)

invoice_data = json.loads(response.choices[0].message.content)
print(json.dumps(invoice_data, indent=2))
```
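LLM output is not guaranteed to match the requested schema, so validating the parsed JSON is a good habit; a sketch using pydantic (the model names are illustrative, not part of docling):

```python
from pydantic import BaseModel, ValidationError

class LineItem(BaseModel):
    description: str
    qty: int
    price: float

class Invoice(BaseModel):
    invoice_number: str
    date: str
    vendor: str
    total: float
    line_items: list[LineItem]

try:
    invoice = Invoice.model_validate(invoice_data)
    print(f"Invoice {invoice.invoice_number}: total {invoice.total}")
except ValidationError as e:
    print(f"LLM output did not match the expected schema:\n{e}")
```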
## How to Enable GPU Acceleration

### NVIDIA CUDA
```bash
# Install with CUDA support
pip install "docling[cuda]"
```

```python
# Use vLLM for fast inference
from docling.datamodel import vlm_model_specs
from docling.datamodel.pipeline_options import VlmPipelineOptions

pipeline_options = VlmPipelineOptions(
    vlm_options=vlm_model_specs.GRANITEDOCLING_VLLM
)
```
### Apple Silicon (M1/M2/M3)

```bash
# Install with MLX support
pip install "docling[vlm]" mlx
```

```python
# Use the MLX-optimized model
from docling.datamodel import vlm_model_specs
from docling.datamodel.pipeline_options import VlmPipelineOptions

pipeline_options = VlmPipelineOptions(
    vlm_options=vlm_model_specs.GRANITEDOCLING_MLX
)
```
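In both cases, the options still need to be attached to a converter that runs docling's VLM pipeline; a minimal sketch following the VLM pipeline wiring used in docling's examples:

```python
from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

pipeline_options = VlmPipelineOptions(vlm_options=vlm_model_specs.GRANITEDOCLING_MLX)

# The VLM pipeline class must be selected explicitly alongside its options
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)
result = converter.convert("document.pdf")
```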
## How to Integrate with LangChain

```python
from langchain_docling import DoclingLoader

# Load documents using Docling
loader = DoclingLoader(file_path="document.pdf")
documents = loader.load()

# Use with LangChain
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
chunks = text_splitter.split_documents(documents)

# Create a vector store
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(chunks, embeddings)

# Query
results = vectorstore.similarity_search("What is the total revenue?")
```

Install: `pip install langchain-docling`
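To see why a chunk was retrieved, FAISS can return similarity scores alongside the documents:

```python
# Inspect retrieved chunks together with their similarity scores
for doc, score in vectorstore.similarity_search_with_score("What is the total revenue?", k=3):
    print(f"score={score:.3f}  {doc.page_content[:100]}...")
```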
## How to Integrate with LlamaIndex

```python
from llama_index.core import VectorStoreIndex
from llama_index.readers.docling import DoclingReader

# Load with the Docling reader
reader = DoclingReader()
documents = reader.load_data(file_path="document.pdf")

# Create an index
index = VectorStoreIndex.from_documents(documents)

# Query
query_engine = index.as_query_engine()
response = query_engine.query("Summarize the key findings")
print(response)
```

Install: `pip install llama-index-readers-docling`
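The query engine accepts the usual LlamaIndex parameters; for example, retrieving more chunks per query for broader context:

```python
# Retrieve more chunks per query (similarity_top_k is a standard LlamaIndex parameter)
query_engine = index.as_query_engine(similarity_top_k=5)
response = query_engine.query("Summarize the key findings")
print(response)
```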
## How to Handle Large Documents

For documents with hundreds of pages, configure the pipeline for memory efficiency (and see the page-range batching sketch below):
```python
import gc

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Configure for memory efficiency
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = 0.5  # Reduce image resolution

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)

# Process the large document
result = converter.convert("large_document.pdf")

# Write the output to disk instead of holding it in memory
with open("output.md", "w") as f:
    f.write(result.document.export_to_markdown())

# Clean up
del result
gc.collect()
```
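For batch processing proper, recent docling releases accept a page range in `convert()`; a sketch, assuming your installed version supports the `page_range` argument (check before relying on it), with an illustrative page count:

```python
# Convert a large PDF in 50-page batches (page_range availability depends on docling version)
batch_size = 50
total_pages = 500  # illustrative; determine this from your document

with open("output.md", "w") as f:
    for start in range(1, total_pages + 1, batch_size):
        end = min(start + batch_size - 1, total_pages)
        result = converter.convert("large_document.pdf", page_range=(start, end))
        f.write(result.document.export_to_markdown())
        del result
        gc.collect()
```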