pdfplumber-demo / app.py
wendys-llc's picture
change names
5d265c2
raw
history blame contribute delete
12.5 kB
import gradio as gr
import pdfplumber
import textwrap
import pprint
import json
import os
from pathlib import Path
# Settings whose values are strategy names and must never be read as numbers.
_STRATEGY_KEYS = frozenset({"vertical_strategy", "horizontal_strategy"})


def _coerce_setting(key, value):
    """Convert numeric text to ``int``/``float``; return anything else as-is."""
    if key in _STRATEGY_KEYS or not isinstance(value, str):
        # Lists (explicit lines) and real numbers need no conversion.
        return value
    text = value.strip()
    for convert in (int, float):
        try:
            return convert(text)
        except ValueError:
            continue
    return value


def _clean_table_settings(raw_settings):
    """Drop unset entries (None, blank text, empty list) and coerce the rest."""
    cleaned = {}
    for key, value in raw_settings.items():
        is_blank_text = isinstance(value, str) and not value.strip()
        if value is None or value == [] or is_blank_text:
            continue
        cleaned[key] = _coerce_setting(key, value)
    return cleaned


def _resolve_pdf_path(file_obj):
    """Return the filesystem path for a path-like value or an object with ``.name``."""
    if file_obj is None:
        raise ValueError("No PDF file was provided.")
    if isinstance(file_obj, (str, os.PathLike)):
        return os.fspath(file_obj)
    return file_obj.name


def table_debugger(
    file_obj,
    page_num=0,
    table_num=0,
    crop_x0=None,
    crop_top=None,
    crop_x1=None,
    crop_bottom=None,
    vertical_strategy=None,
    horizontal_strategy=None,
    explicit_vertical_lines=None,
    explicit_horizontal_lines=None,
    snap_tolerance=None,
    snap_x_tolerance=None,
    snap_y_tolerance=None,
    join_tolerance=None,
    join_x_tolerance=None,
    join_y_tolerance=None,
    text_tolerance=None,
    text_x_tolerance=None,
    text_y_tolerance=None,
    intersection_tolerance=None,
    intersection_x_tolerance=None,
    intersection_y_tolerance=None,
    edge_min_length=None,
    min_words_vertical=None,
    min_words_horizontal=None,
    keep_blank_chars=None,
):
    """Extract one table from a PDF page and describe how it was found.

    Args:
        file_obj: Path to the PDF (``str``/``os.PathLike``) or an object whose
            ``.name`` attribute holds that path.
        page_num: Zero-based page index.
        table_num: Zero-based index of the table to return from that page.
        crop_x0, crop_top, crop_x1, crop_bottom: Optional crop box. Blank
            values fall back to the page edges. Negative ``crop_x1`` /
            ``crop_bottom`` are measured back from the right / bottom edge.
        explicit_vertical_lines, explicit_horizontal_lines: JSON text
            describing explicit lines, or blank.
        keep_blank_chars: Accepted for interface compatibility; it is not
            forwarded to pdfplumber.
        Remaining keyword arguments are forwarded to pdfplumber as table
        settings. Unset values are omitted and numeric text is converted to
        ``int`` or ``float``.

    Returns:
        ``[notes, visual, table]``: Markdown notes (including a reproducible
        code snippet), the annotated table-finder image, and the selected
        table's rows, or ``None`` when ``table_num`` has no matching table.

    Raises:
        ValueError: If no file is given, or a crop value / JSON text is invalid.
        IndexError: If ``page_num`` is outside the document.
    """
    table_settings = _clean_table_settings(
        {
            "vertical_strategy": vertical_strategy,
            "horizontal_strategy": horizontal_strategy,
            "explicit_vertical_lines": json.loads(explicit_vertical_lines)
            if explicit_vertical_lines
            else None,
            "explicit_horizontal_lines": json.loads(explicit_horizontal_lines)
            if explicit_horizontal_lines
            else None,
            "snap_tolerance": snap_tolerance,
            "snap_x_tolerance": snap_x_tolerance,
            "snap_y_tolerance": snap_y_tolerance,
            "join_tolerance": join_tolerance,
            "join_x_tolerance": join_x_tolerance,
            "join_y_tolerance": join_y_tolerance,
            "text_tolerance": text_tolerance,
            "text_x_tolerance": text_x_tolerance,
            "text_y_tolerance": text_y_tolerance,
            "intersection_tolerance": intersection_tolerance,
            "intersection_x_tolerance": intersection_x_tolerance,
            "intersection_y_tolerance": intersection_y_tolerance,
            "edge_min_length": edge_min_length,
            "min_words_vertical": min_words_vertical,
            "min_words_horizontal": min_words_horizontal,
        }
    )

    table_num = int(table_num)
    page_num = int(page_num)
    pdf_path = _resolve_pdf_path(file_obj)

    with pdfplumber.open(pdf_path) as pdf:
        page_count = len(pdf.pages)
        page = pdf.pages[page_num]
        page_width = int(page.width)
        page_height = int(page.height)

        crop_x0 = int(crop_x0) if crop_x0 else 0
        crop_top = int(crop_top) if crop_top else 0
        crop_x1 = int(crop_x1) if crop_x1 else page_width
        crop_bottom = int(crop_bottom) if crop_bottom else page_height

        # Negative right/bottom values count back from the far page edge.
        if crop_bottom < 0:
            crop_bottom = page_height + crop_bottom
        if crop_x1 < 0:
            crop_x1 = page_width + crop_x1

        crop_box = (crop_x0, crop_top, crop_x1, crop_bottom)
        # Only crop when the box differs from the full page.
        is_cropped = crop_box != (0, 0, page_width, page_height)
        if is_cropped:
            page = page.crop(crop_box)

        tables = page.extract_tables(table_settings)
        # Honour the requested table index instead of always taking the first.
        table = tables[table_num] if 0 <= table_num < len(tables) else None
        visual = page.to_image().debug_tablefinder(table_settings).annotated

    base_filename = Path(pdf_path).name
    note_lines = [
        f"- **Filename:** {base_filename}",
        f"- **Pages:** {page_count}",
        f"- **Page num {page_num}:**",
        f"- **Full dimensions:** {page_width} x {page_height}",
        f"- **Crop:** {crop_x0}, {crop_top}, {crop_x1}, {crop_bottom}",
        f"- **Tables found:** {len(tables)}",
        "```python",
        "import pdfplumber",
        f'pdf = pdfplumber.open("{base_filename}")',
        f"page = pdf.pages[{page_num}]",
    ]
    if is_cropped:
        # No leading whitespace: the snippet must be valid Python when copied.
        note_lines.append(
            f"page = page.crop(({crop_x0}, {crop_top}, {crop_x1}, {crop_bottom}))"
        )
    note_lines += [
        "",
        f"table_settings = {pprint.pformat(table_settings, indent=8).strip()}",
        "tables = page.extract_tables(table_settings)",
        f"table = tables[{table_num}]",
        "```",
    ]
    notes = "\n".join(note_lines)

    return [notes, visual, table]
def demo_subset(
    file_obj,
    page_num,
    table_num,
    vertical_strategy,
    horizontal_strategy,
    snap_y_tolerance,
    intersection_x_tolerance,
    crop_bottom,
):
    """Run ``table_debugger`` with only the settings the bundled examples vary.

    Every argument is forwarded by keyword; all other ``table_debugger``
    options keep their defaults. Returns whatever ``table_debugger`` returns.
    """
    forwarded = {
        "page_num": page_num,
        "table_num": table_num,
        "vertical_strategy": vertical_strategy,
        "horizontal_strategy": horizontal_strategy,
        "snap_y_tolerance": snap_y_tolerance,
        "intersection_x_tolerance": intersection_x_tolerance,
        "crop_bottom": crop_bottom,
    }
    return table_debugger(file_obj, **forwarded)
# UI components are created up front and placed into the layout later through
# their ``.render()`` calls inside the ``gr.Blocks`` context.

# --- Outputs ---------------------------------------------------------------
notes = gr.Markdown()
output_image = gr.Image()
data_table = gr.Dataframe(height=250, render=False, type="array", label="Found data")

# --- Crop box (free text so a field can be left blank) ----------------------
crop_top = gr.Text(label="Crop (top)", placeholder="top", container=False, render=False)
crop_x0 = gr.Text(label="Crop (x0)", placeholder="left", container=False, render=False)
crop_x1 = gr.Text(
    label="Crop (x1)", placeholder="right (from page left)", container=False, render=False
)
crop_bottom = gr.Text(
    label="Crop (bottom)", placeholder="bottom (from page top)", container=False, render=False
)

# --- Table-finding strategies ----------------------------------------------
vertical_strategy = gr.Dropdown(
    label="Vertical Strategy",
    choices=["lines", "lines_strict", "text", "explicit"],
    render=False,
    value="lines",
)
horizontal_strategy = gr.Dropdown(
    label="Horizontal Strategy",
    choices=["lines", "lines_strict", "text", "explicit"],
    render=False,
    value="lines",
)

# --- Explicit lines (entered as JSON text) ----------------------------------
explicit_vertical_lines = gr.Textbox(
    label="explicit_vertical_lines", render=False, placeholder="[]"
)
explicit_horizontal_lines = gr.Textbox(
    label="explicit_horizontal_lines", render=False, placeholder="[]"
)

# --- Tolerances (text boxes so "unset" can be expressed as blank) -----------
snap_tolerance = gr.Textbox(label="Snap tolerance", placeholder="3", render=False)
snap_x_tolerance = gr.Textbox(label="Snap tolerance (x)", placeholder="3", render=False)
snap_y_tolerance = gr.Textbox(label="Snap tolerance (y)", placeholder="3", render=False)
join_tolerance = gr.Textbox(label="Join tolerance", placeholder="3", render=False)
join_x_tolerance = gr.Textbox(label="Join tolerance (x)", placeholder="3", render=False)
# Label fixed: this box feeds join_y_tolerance, so it must read "(y)".
join_y_tolerance = gr.Textbox(label="Join tolerance (y)", placeholder="3", render=False)
text_tolerance = gr.Textbox(
    label="Text tolerance", placeholder="1", render=False, value=None
)
text_x_tolerance = gr.Textbox(label="Text tolerance (x)", placeholder="1", render=False)
text_y_tolerance = gr.Textbox(label="Text tolerance (y)", placeholder="1", render=False)
intersection_tolerance = gr.Textbox(
    label="Intersection tolerance", placeholder="1", render=False
)
intersection_x_tolerance = gr.Textbox(
    label="Intersection tolerance (x)", placeholder="1", render=False
)
intersection_y_tolerance = gr.Textbox(
    label="Intersection tolerance (y)", placeholder="1", render=False
)

# --- Remaining table settings -----------------------------------------------
edge_min_length = gr.Textbox(label="edge_min_length", placeholder="3", render=False)
min_words_vertical = gr.Textbox(
    label="min_words_vertical", placeholder="3", render=False
)
min_words_horizontal = gr.Textbox(
    label="min_words_horizontal", placeholder="1", render=False
)
keep_blank_chars = gr.Checkbox(label="Keep blank chars?", value=False)

# --- Document and table selection -------------------------------------------
# File extensions are given with a leading dot, the form Gradio documents.
file = gr.File(label="PDF", type="filepath", file_types=[".pdf"], render=False)
page_num = gr.Number(
    label="Page number", value=0, info="It's an index: first is 0!", render=False
)
table_num = gr.Number(
    label="Table number", value=0, info="It's an index: first is 0!", render=False
)
# Folder of sample PDFs that sits next to this file.
example_dir = Path(os.path.dirname(__file__)) / "examples"

# One row per bundled example. Columns follow the ``demo_subset`` parameters:
# file, page_num, table_num, vertical_strategy, horizontal_strategy,
# snap_y_tolerance, intersection_x_tolerance, crop_bottom.
examples = [
    [str(example_dir / pdf_name), *settings]
    for pdf_name, *settings in (
        ("players.pdf", 0, 0, "text", "text", None, None, None),
        ("museums.pdf", 2, 0, "lines", "lines", None, None, None),
        ("background-checks.pdf", 0, 0, "text", "text", 5, 15, 487),
    )
]
# Page layout: the pre-built components are placed with ``.render()``.
# NOTE(review): container nesting below is reconstructed from statement
# order — confirm against the rendered layout.
with gr.Blocks() as demo:
    # Intro text shown at the top of the page.
    gr.Markdown(
        """
# pdfplumber table extraction playground
[pdfplumber](https://github.com/jsvine/pdfplumber/) is a delightful library for processing PDFs, including table extraction. **Scroll down for examples and lots more settings!**
YouTube is full of [pdfplumber tutorials](https://www.youtube.com/results?search_query=pdfplumber), but for the notebook-lovers I recommend [this](https://github.com/jsvine/nicar-2023-pdfplumber-workshop) or [this](https://github.com/jsvine/lede-2023/tree/main/pdf-parsing/).
"""
    )
    with gr.Row():
        # First column: file picker, main settings, Run button and notes.
        with gr.Column(scale=2):
            file.render()
            with gr.Accordion("Table details", open=True):
                with gr.Group():
                    with gr.Row():
                        page_num.render()
                        table_num.render()
                    with gr.Row():
                        vertical_strategy.render()
                        horizontal_strategy.render()
            with gr.Accordion("Crop", open=True):
                with gr.Group():
                    crop_top.render()
                    with gr.Row():
                        crop_x0.render()
                        crop_x1.render()
                    crop_bottom.render()
            btn = gr.Button(value="Run")
            # Inputs are listed in the same order as table_debugger's
            # parameters, so they are passed positionally.
            btn.click(
                table_debugger,
                inputs=[
                    file,
                    page_num,
                    table_num,
                    crop_x0,
                    crop_top,
                    crop_x1,
                    crop_bottom,
                    vertical_strategy,
                    horizontal_strategy,
                    explicit_vertical_lines,
                    explicit_horizontal_lines,
                    snap_tolerance,
                    snap_x_tolerance,
                    snap_y_tolerance,
                    join_tolerance,
                    join_x_tolerance,
                    join_y_tolerance,
                    text_tolerance,
                    text_x_tolerance,
                    text_y_tolerance,
                    intersection_tolerance,
                    intersection_x_tolerance,
                    intersection_y_tolerance,
                    edge_min_length,
                    min_words_vertical,
                    min_words_horizontal,
                    keep_blank_chars,
                ],
                outputs=[notes, output_image, data_table],
            )
            notes.render()
        # Second column: extracted rows and the annotated page image.
        with gr.Column(scale=3):
            data_table.render()
            output_image.render()
    # Example rows run demo_subset on click; the inputs follow the order of
    # demo_subset's parameters.
    gr.Examples(
        examples=examples,
        inputs=[
            file,
            page_num,
            table_num,
            vertical_strategy,
            horizontal_strategy,
            snap_y_tolerance,
            intersection_x_tolerance,
            crop_bottom,
        ],
        outputs=[notes, output_image, data_table],
        fn=demo_subset,
        run_on_click=True,
    )
    gr.Markdown("## Additional options")
    with gr.Row():
        # Tolerance settings: each general value is followed by its x/y pair.
        with gr.Column():
            with gr.Group():
                snap_tolerance.render()
                with gr.Row():
                    snap_x_tolerance.render()
                    snap_y_tolerance.render()
                join_tolerance.render()
                with gr.Row():
                    join_x_tolerance.render()
                    join_y_tolerance.render()
                text_tolerance.render()
                with gr.Row():
                    text_x_tolerance.render()
                    text_y_tolerance.render()
                intersection_tolerance.render()
                with gr.Row():
                    intersection_x_tolerance.render()
                    intersection_y_tolerance.render()
        # Explicit lines and the remaining settings.
        with gr.Column():
            with gr.Group():
                explicit_vertical_lines.render()
                explicit_horizontal_lines.render()
                edge_min_length.render()
                with gr.Row():
                    min_words_vertical.render()
                    min_words_horizontal.render()
                keep_blank_chars.render()
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()