Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions pdf-table-extraction-docling-vs-llamaparse/docling_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""Parse a PDF with Docling and print Markdown output."""

from pathlib import Path

from docling.document_converter import DocumentConverter

PDF_PATH = Path("sample_report.pdf")


def main() -> None:
    """Convert ``PDF_PATH`` with Docling and print a Markdown preview and counts."""
    parsed_doc = DocumentConverter().convert(PDF_PATH).document

    # Only the first 3000 characters of the Markdown export are printed.
    preview = parsed_doc.export_to_markdown()[:3000]
    print(preview)
    print("\n---\n")

    page_count = len(parsed_doc.pages)
    print(f"Pages parsed: {page_count}")
    table_count = len(parsed_doc.tables)
    print(f"Tables found: {table_count}")


if __name__ == "__main__":
    main()
34 changes: 34 additions & 0 deletions pdf-table-extraction-docling-vs-llamaparse/docling_formats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Export Docling parse results to Markdown, JSON, HTML, and pandas DataFrames."""

import json
from pathlib import Path

from docling.document_converter import DocumentConverter

PDF_PATH = Path("sample_report.pdf")


def main() -> None:
    """Convert ``PDF_PATH`` once with Docling and emit every export format.

    Writes ``output_docling.md``, ``output_docling.json`` and
    ``output_docling.html`` (relative paths, UTF-8 encoded), then prints the
    shape and ``head()`` of each table exported as a DataFrame.
    """
    doc = DocumentConverter().convert(PDF_PATH).document

    # Markdown export.
    md_text = doc.export_to_markdown()
    Path("output_docling.md").write_text(md_text, encoding="utf-8")

    # JSON export: the document is turned into a dict, then serialized.
    json_text = json.dumps(doc.export_to_dict(), indent=2)
    Path("output_docling.json").write_text(json_text, encoding="utf-8")

    # HTML export.
    html_text = doc.export_to_html()
    Path("output_docling.html").write_text(html_text, encoding="utf-8")

    # One DataFrame per table found in the document.
    for position, table_item in enumerate(doc.tables):
        table_frame = table_item.export_to_dataframe(doc=doc)
        print(f"Table {position} shape: {table_frame.shape}")
        print(table_frame.head(), end="\n\n")


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""Parse a PDF with LlamaParse (llama-cloud SDK) and print Markdown output."""

import os
from pathlib import Path

from llama_cloud import LlamaCloud

PDF_PATH = Path("sample_report.pdf")


def main() -> None:
    """Upload ``PDF_PATH`` to LlamaCloud, parse it, and print a Markdown preview.

    The API key is read from the ``LLAMA_CLOUD_API_KEY`` environment variable;
    a missing variable raises ``KeyError``. Only the first 3000 characters of
    the combined Markdown are printed, followed by the number of parsed pages.
    """
    client = LlamaCloud(api_key=os.environ["LLAMA_CLOUD_API_KEY"])

    uploaded = client.files.create(file=PDF_PATH, purpose="parse")
    result = client.parsing.parse(
        file_id=uploaded.id,
        tier="agentic",
        version="latest",
        expand=["markdown"],
    )

    parsed_pages = result.markdown.pages
    # Every page is followed by a "---" separator line. Building the text with
    # a single join avoids repeated string concatenation inside a loop.
    pages = "".join(page.markdown + "\n---\n" for page in parsed_pages)

    print(pages[:3000])
    print(f"Pages parsed: {len(parsed_pages)}")


if __name__ == "__main__":
    main()
63 changes: 63 additions & 0 deletions pdf-table-extraction-docling-vs-llamaparse/llamaparse_formats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Export LlamaParse results to Markdown, Text, and schema-driven JSON."""

import json
import os
from pathlib import Path

from llama_cloud import LlamaCloud
from pydantic import BaseModel, Field

PDF_PATH = Path("sample_report.pdf")


class RevenueRow(BaseModel):
    # Schema for a single revenue-table row (one fiscal quarter).
    # NOTE: documented with comments instead of a docstring on purpose;
    # pydantic copies a model docstring into the generated JSON schema, which
    # would alter the schema built from this model.
    quarter: str = Field(
        description="Fiscal quarter label, e.g. Q1 2024",
    )
    revenue_millions: float = Field(
        description="Revenue in millions of USD",
    )
    # Optional field: defaults to None when no growth figure is supplied.
    growth_percent: float | None = Field(
        description="Year-over-year growth percentage if stated",
        default=None,
    )


class RevenueTable(BaseModel):
    # Container model holding the list of RevenueRow entries.
    # Kept docstring-free so the JSON schema generated from it is unchanged.
    rows: list[RevenueRow] = Field(
        description="One row per quarter in the table",
    )


def main() -> None:
    """Parse ``PDF_PATH`` with LlamaParse and save Markdown, text, and JSON.

    Writes ``output_llamaparse.md`` always, ``output_llamaparse.text`` only
    when the parse result carries text pages, and ``output_llamaparse.json``
    with the schema-driven extraction result (also printed to stdout). The API
    key comes from the ``LLAMA_CLOUD_API_KEY`` environment variable.
    """
    api_key = os.environ["LLAMA_CLOUD_API_KEY"]
    client = LlamaCloud(api_key=api_key)

    # Parsing pass: request both the Markdown and the plain-text expansions.
    parse_upload = client.files.create(file=PDF_PATH, purpose="parse")
    parsed = client.parsing.parse(
        file_id=parse_upload.id,
        tier="agentic",
        version="latest",
        expand=["markdown", "text"],
    )

    # Markdown pages are separated by a blank line.
    md_chunks = [page.markdown for page in parsed.markdown.pages]
    Path("output_llamaparse.md").write_text(
        "\n\n".join(md_chunks),
        encoding="utf-8",
    )

    # The text output is skipped when the result has no text pages.
    if parsed.text and parsed.text.pages:
        text_chunks = [page.text for page in parsed.text.pages]
        Path("output_llamaparse.text").write_text(
            "\n".join(text_chunks),
            encoding="utf-8",
        )

    # Extraction pass: the file is uploaded again with purpose="extract" and
    # the RevenueTable JSON schema drives the structured output.
    extract_upload = client.files.create(file=PDF_PATH, purpose="extract")
    job = client.extract.run(
        file_input=extract_upload.id,
        configuration={
            "data_schema": RevenueTable.model_json_schema(),
            "extraction_target": "per_doc",
            "tier": "agentic",
        },
    )

    # Serialize once; the same text goes to the file and to stdout.
    rendered = json.dumps(job.extract_result, indent=2)
    Path("output_llamaparse.json").write_text(rendered, encoding="utf-8")
    print(rendered)


if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions pdf-table-extraction-docling-vs-llamaparse/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
docling==2.102.2
llama-cloud>=2.9.0
pandas>=2.0.0
pydantic>=2.0.0
Binary file not shown.
Loading