
Docling How-To Guides

Step-by-step solutions for specific tasks: OCR configuration, table extraction, RAG integration, and performance optimization.

How to Configure OCR Engines

Docling supports multiple OCR backends. Choose based on your needs:

EasyOCR (Multi-language)

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = EasyOcrOptions(
    lang=["en", "fr", "de"],  # Languages to detect
    use_gpu=True,
    confidence_threshold=0.5
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)

Install: pip install "docling[easyocr]"
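
Once configured, the converter runs like the default pipeline; a minimal sketch (the input filename is a placeholder):

result = converter.convert("multilingual_scan.pdf")
print(result.document.export_to_markdown())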

Tesseract (System OCR)

from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractOcrOptions

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = TesseractOcrOptions()

# Requires system Tesseract installation:
# macOS: brew install tesseract
# Ubuntu: apt-get install tesseract-ocr libtesseract-dev

Install: pip install "docling[tesserocr]"
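
Tesseract uses its own three-letter language codes rather than the two-letter codes shown for EasyOCR; a minimal sketch, assuming the matching system language packs (e.g. tesseract-ocr-deu) are installed:

pipeline_options.ocr_options = TesseractOcrOptions(
    lang=["eng", "deu"]  # Tesseract language codes; each pack must be installed system-wide
)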

RapidOCR (Default, ONNX-based)

from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
from huggingface_hub import snapshot_download
import os

# Download custom models from HuggingFace
download_path = snapshot_download(repo_id="SWHL/RapidOCR")

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = RapidOcrOptions(
    det_model_path=os.path.join(download_path, "PP-OCRv4", "en_PP-OCRv3_det_infer.onnx"),
    rec_model_path=os.path.join(download_path, "PP-OCRv4", "ch_PP-OCRv4_rec_server_infer.onnx"),
)
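
As with the other engines, the options object is then wired into a converter (a minimal sketch reusing the pattern from above; the input filename is a placeholder):

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("document.pdf")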

How to Process Scanned PDFs

Force full-page OCR for scanned documents where text extraction fails:

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

# Force OCR on every page (for scanned documents)
pipeline_options.ocr_options = EasyOcrOptions(
    force_full_page_ocr=True
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)

result = converter.convert("scanned_document.pdf")
print(result.document.export_to_markdown())

How to Extract Tables to CSV/Excel

from docling.document_converter import DocumentConverter
import pandas as pd

converter = DocumentConverter()
result = converter.convert("report.pdf")

# Extract all tables
for i, table in enumerate(result.document.tables):
    # To DataFrame
    df = table.export_to_dataframe()

    # Save as CSV
    df.to_csv(f"table_{i}.csv", index=False)

    # Save as Excel (requires openpyxl: pip install openpyxl)
    df.to_excel(f"table_{i}.xlsx", index=False)

    # Or get as Markdown
    print(table.export_to_markdown())
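
If you need to know where each table came from, table items carry provenance metadata; a minimal sketch, assuming each item exposes a prov list with page numbers as in current Docling releases:

for i, table in enumerate(result.document.tables):
    page_no = table.prov[0].page_no if table.prov else None
    print(f"table_{i}.csv came from page {page_no}")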

How to Extract Invoice Data

Convert invoice to structured data, then use an LLM to extract fields:

from docling.document_converter import DocumentConverter
from openai import OpenAI
import json

# Step 1: Convert PDF to Markdown
converter = DocumentConverter()
result = converter.convert("invoice.pdf")
markdown = result.document.export_to_markdown()

# Step 2: Use LLM to extract structured data
client = OpenAI()

response = client.chat.completions.create(
    model="gpt-5",  # or your preferred model
    messages=[{
        "role": "user",
        "content": f"""Extract invoice data as JSON:
{"invoice_number": str, "date": str, "vendor": str, "total": float,
 "line_items": [{"description": str, "qty": int, "price": float}]}

Document:
{markdown}"""
    }],
    response_format={"type": "json_object"}
)

invoice_data = json.loads(response.choices[0].message.content)
print(json.dumps(invoice_data, indent=2))
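
LLM output can drift from the requested schema, so it is worth validating before use; a minimal sketch using Pydantic (an assumption, not part of Docling), with fields mirroring the prompt above:

from pydantic import BaseModel

class LineItem(BaseModel):
    description: str
    qty: int
    price: float

class Invoice(BaseModel):
    invoice_number: str
    date: str
    vendor: str
    total: float
    line_items: list[LineItem]

invoice = Invoice.model_validate(invoice_data)  # raises ValidationError on malformed output
print(invoice.total)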

How to Enable GPU Acceleration

NVIDIA CUDA

# Install with CUDA support
pip install "docling[cuda]"

# Use vLLM for fast inference
from docling.datamodel import vlm_model_specs
from docling.datamodel.pipeline_options import VlmPipelineOptions

pipeline_options = VlmPipelineOptions(
    vlm_options=vlm_model_specs.GRANITEDOCLING_VLLM
)

Apple Silicon (M1/M2/M3)

# Install with MLX support
pip install "docling[vlm]" mlx

# Use MLX-optimized model
from docling.datamodel import vlm_model_specs
from docling.datamodel.pipeline_options import VlmPipelineOptions

pipeline_options = VlmPipelineOptions(
    vlm_options=vlm_model_specs.GRANITEDOCLING_MLX
)
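
Either options object is then plugged into a converter through the VLM pipeline; a minimal sketch, assuming VlmPipeline is importable from docling.pipeline.vlm_pipeline as in current releases:

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)
result = converter.convert("document.pdf")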

How to Integrate with LangChain

from langchain_docling import DoclingLoader  # pip install langchain-docling

# Load documents using Docling
loader = DoclingLoader(file_path="document.pdf")
documents = loader.load()

# Use with LangChain
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Split into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
chunks = text_splitter.split_documents(documents)

# Create vector store
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(chunks, embeddings)

# Query
results = vectorstore.similarity_search("What is the total revenue?")
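
To go from retrieved chunks to an answer, pass them to an LLM; a minimal sketch (the model name is a placeholder and the prompt format is illustrative):

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")
question = "What is the total revenue?"
context = "\n\n".join(doc.page_content for doc in results)
answer = llm.invoke(f"Answer using only this context:\n{context}\n\nQuestion: {question}")
print(answer.content)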

How to Integrate with LlamaIndex

from llama_index.readers.docling import DoclingReader  # pip install llama-index-readers-docling
from llama_index.core import VectorStoreIndex

# Load with Docling reader
reader = DoclingReader()
documents = reader.load_data(file_path="document.pdf")

# Create index
index = VectorStoreIndex.from_documents(documents)

# Query
query_engine = index.as_query_engine()
response = query_engine.query("Summarize the key findings")
print(response)

How to Handle Large Documents

For documents with hundreds of pages, reduce memory pressure and stream the output; to batch the conversion itself, see the page-range sketch after this example:

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
import gc

# Configure for memory efficiency
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = 0.5  # Reduce image resolution

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)

# Process large document
result = converter.convert("large_document.pdf")

# Stream output to avoid memory issues
with open("output.md", "w") as f:
    f.write(result.document.export_to_markdown())

# Clean up
del result
gc.collect()
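
To process in true batches, recent Docling releases accept a page_range argument on convert(); a minimal sketch, assuming your version supports it and that the page count is known:

batch_size = 50
total_pages = 500  # assumed; substitute your document's page count

with open("output.md", "w") as f:
    for start in range(1, total_pages + 1, batch_size):
        end = min(start + batch_size - 1, total_pages)
        result = converter.convert("large_document.pdf", page_range=(start, end))
        f.write(result.document.export_to_markdown())
        f.write("\n\n")
        del result
        gc.collect()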