Spaces:

wendys-llc
/

pdfplumber-demo

Runtime error

App Files Files Community

pdfplumber-demo / app.py

wendys-llc

change names

5d265c2 over 1 year ago

raw

history blame contribute delete

12.5 kB

	import gradio as gr
	import pdfplumber
	import textwrap
	import pprint
	import json
	import os
	from pathlib import Path


	def _coerce_setting(value):
	    """Convert a raw UI value into a type usable as a table setting.

	    Strings that hold a number become ``int`` (preferred) or ``float``.
	    Anything else -- strategy names, lists parsed from JSON, values that
	    are already numeric -- is returned unchanged.
	    """
	    if not isinstance(value, str):
	        return value
	    text = value.strip()
	    for convert in (int, float):
	        try:
	            return convert(text)
	        except ValueError:
	            continue
	    return text


	def table_debugger(
	    file_obj,
	    page_num=0,
	    table_num=0,
	    crop_x0=None,
	    crop_top=None,
	    crop_x1=None,
	    crop_bottom=None,
	    vertical_strategy=None,
	    horizontal_strategy=None,
	    explicit_vertical_lines=None,
	    explicit_horizontal_lines=None,
	    snap_tolerance=None,
	    snap_x_tolerance=None,
	    snap_y_tolerance=None,
	    join_tolerance=None,
	    join_x_tolerance=None,
	    join_y_tolerance=None,
	    text_tolerance=None,
	    text_x_tolerance=None,
	    text_y_tolerance=None,
	    intersection_tolerance=None,
	    intersection_x_tolerance=None,
	    intersection_y_tolerance=None,
	    edge_min_length=None,
	    min_words_vertical=None,
	    min_words_horizontal=None,
	    keep_blank_chars=None,
	):
	    """Run pdfplumber's table finder on one page and report the result.

	    Args:
	        file_obj: The PDF to open. Either a path string or an object whose
	            ``name`` attribute is the path.
	        page_num: Zero-based index into ``pdf.pages``.
	        table_num: Zero-based index of the table to return.
	        crop_x0, crop_top, crop_x1, crop_bottom: Optional crop box. Empty
	            values fall back to the page edges; negative ``crop_x1`` /
	            ``crop_bottom`` are measured from the right / bottom edge.
	        explicit_vertical_lines, explicit_horizontal_lines: Optional JSON
	            text, decoded with ``json.loads``.
	        keep_blank_chars: Accepted so the UI can pass it, but not forwarded
	            to pdfplumber.
	        Remaining keyword arguments are forwarded under their own names as
	        table settings; unset ones (``None``, ``""``, ``[]``) are dropped
	        and numeric strings are converted to numbers.

	    Returns:
	        A list ``[notes, visual, table]``: a markdown summary including a
	        reproducible code snippet, the ``annotated`` image from
	        ``debug_tablefinder``, and the table at ``table_num`` (``None``
	        when no such table was found).

	    Raises:
	        json.JSONDecodeError: If an explicit-lines field is not valid JSON.
	        IndexError: If ``page_num`` is outside the document.
	    """
	    raw_settings = {
	        "vertical_strategy": vertical_strategy,
	        "horizontal_strategy": horizontal_strategy,
	        "explicit_vertical_lines": json.loads(explicit_vertical_lines)
	        if explicit_vertical_lines
	        else None,
	        "explicit_horizontal_lines": json.loads(explicit_horizontal_lines)
	        if explicit_horizontal_lines
	        else None,
	        "snap_tolerance": snap_tolerance,
	        "snap_x_tolerance": snap_x_tolerance,
	        "snap_y_tolerance": snap_y_tolerance,
	        "join_tolerance": join_tolerance,
	        "join_x_tolerance": join_x_tolerance,
	        "join_y_tolerance": join_y_tolerance,
	        "text_tolerance": text_tolerance,
	        "text_x_tolerance": text_x_tolerance,
	        "text_y_tolerance": text_y_tolerance,
	        "intersection_tolerance": intersection_tolerance,
	        "intersection_x_tolerance": intersection_x_tolerance,
	        "intersection_y_tolerance": intersection_y_tolerance,
	        "edge_min_length": edge_min_length,
	        "min_words_vertical": min_words_vertical,
	        "min_words_horizontal": min_words_horizontal,
	    }

	    # Keep only the settings the user actually filled in, so pdfplumber's
	    # own defaults apply to everything else.
	    table_settings = {}
	    for key, value in raw_settings.items():
	        if isinstance(value, str):
	            value = value.strip()
	        if value is None or value == "" or value == []:
	            continue
	        # Values may be strings, numbers or lists; only strings need parsing.
	        table_settings[key] = _coerce_setting(value)

	    page_num = int(page_num)
	    table_num = int(table_num)

	    # Accept both a plain path and a file-like wrapper exposing ``.name``.
	    pdf_path = str(getattr(file_obj, "name", file_obj))
	    base_filename = os.path.basename(pdf_path)

	    with pdfplumber.open(pdf_path) as pdf:
	        page_count = len(pdf.pages)
	        page = pdf.pages[page_num]
	        page_width = int(page.width)
	        page_height = int(page.height)

	        crop_x0 = int(crop_x0) if crop_x0 else 0
	        crop_top = int(crop_top) if crop_top else 0
	        crop_x1 = int(crop_x1) if crop_x1 else page_width
	        crop_bottom = int(crop_bottom) if crop_bottom else page_height

	        # Allow negative numbers: they count back from the far edge.
	        if crop_bottom < 0:
	            crop_bottom = page_height + crop_bottom
	        if crop_x1 < 0:
	            crop_x1 = page_width + crop_x1

	        is_cropped = (
	            crop_x0 != 0
	            or crop_top != 0
	            or crop_x1 != page_width
	            or crop_bottom != page_height
	        )

	        # Only crop if we need to!
	        if is_cropped:
	            page = page.crop((crop_x0, crop_top, crop_x1, crop_bottom))

	        tables = page.extract_tables(table_settings)
	        # Return the table the user asked for, matching the snippet below.
	        try:
	            table = tables[table_num]
	        except IndexError:
	            table = None
	        visual = page.to_image().debug_tablefinder(table_settings).annotated

	    # Build the markdown line by line so no source indentation leaks into
	    # the rendered list or the fenced code block.
	    notes_lines = [
	        f"- Filename: {base_filename}",
	        f"- Pages: {page_count}",
	        f"- Page num {page_num}:",
	        f"- Full dimensions: {page_width} x {page_height}",
	        f"- Crop: {crop_x0}, {crop_top}, {crop_x1}, {crop_bottom}",
	        f"- Tables found: {len(tables)}",
	        "",
	        "```python",
	        "import pdfplumber",
	        f'pdf = pdfplumber.open("{base_filename}")',
	        f"page = pdf.pages[{page_num}]",
	    ]
	    if is_cropped:
	        notes_lines.append(
	            f"page = page.crop(({crop_x0}, {crop_top}, {crop_x1}, {crop_bottom}))"
	        )
	    notes_lines += [
	        "",
	        f"table_settings = {pprint.pformat(table_settings, indent=8).strip()}",
	        "tables = page.extract_tables(table_settings)",
	        f"table = tables[{table_num}]",
	        "```",
	    ]
	    notes = "\n".join(notes_lines)

	    return [notes, visual, table]


	def demo_subset(
	    file_obj,
	    page_num,
	    table_num,
	    vertical_strategy,
	    horizontal_strategy,
	    snap_y_tolerance,
	    intersection_x_tolerance,
	    crop_bottom,
	):
	    """Call ``table_debugger`` with only the options the examples set.

	    Every argument is forwarded by keyword under the same name; all other
	    ``table_debugger`` parameters keep their defaults. Returns whatever
	    ``table_debugger`` returns.
	    """
	    forwarded = {
	        "page_num": page_num,
	        "table_num": table_num,
	        "vertical_strategy": vertical_strategy,
	        "horizontal_strategy": horizontal_strategy,
	        "snap_y_tolerance": snap_y_tolerance,
	        "intersection_x_tolerance": intersection_x_tolerance,
	        "crop_bottom": crop_bottom,
	    }
	    return table_debugger(file_obj, **forwarded)


	# Output components. They are created here and placed into the page later
	# with ``.render()`` inside the Blocks layout.
	notes = gr.Markdown()
	output_image = gr.Image()
	data_table = gr.Dataframe(height=250, render=False, type='array', label='Found data')

	# Crop inputs. Free text so that an empty box can mean "use the page edge".
	crop_top = gr.Text(label="Crop (top)", placeholder="top", container=False, render=False)
	crop_x0 = gr.Text(label=" Crop (x0)", placeholder="left", container=False, render=False)
	crop_x1 = gr.Text(
	    label="Crop (x1)", placeholder="right (from page left)", container=False, render=False
	)
	crop_bottom = gr.Text(
	    label="Crop (bottom)", placeholder="bottom (from page top)", container=False, render=False
	)

	# Table-finder settings. Each component is named after the table_debugger
	# parameter it feeds.
	vertical_strategy = gr.Dropdown(
	    label="Vertical Strategy",
	    choices=["lines", "lines_strict", "text", "explicit"],
	    render=False,
	    value="lines",
	)
	horizontal_strategy = gr.Dropdown(
	    label="Horizontal Strategy",
	    choices=["lines", "lines_strict", "text", "explicit"],
	    render=False,
	    value="lines",
	)
	# Explicit lines are typed as JSON text and decoded in table_debugger.
	explicit_vertical_lines = gr.Textbox(
	    label="explicit_vertical_lines", render=False, placeholder="[]"
	)
	explicit_horizontal_lines = gr.Textbox(
	    label="explicit_horizontal_lines", render=False, placeholder="[]"
	)
	snap_tolerance = gr.Textbox(label="Snap tolerance", placeholder="3", render=False)
	snap_x_tolerance = gr.Textbox(label="Snap tolerance (x)", placeholder="3", render=False)
	snap_y_tolerance = gr.Textbox(label="Snap tolerance (y)", placeholder="3", render=False)
	join_tolerance = gr.Textbox(label="Join tolerance", placeholder="3", render=False)
	join_x_tolerance = gr.Textbox(label="Join tolerance (x)", placeholder="3", render=False)
	# Label fixed: this is the y-axis box (it previously repeated "(x)").
	join_y_tolerance = gr.Textbox(label="Join tolerance (y)", placeholder="3", render=False)
	text_tolerance = gr.Textbox(
	    label="Text tolerance", placeholder="1", render=False, value=None
	)
	text_x_tolerance = gr.Textbox(label="Text tolerance (x)", placeholder="1", render=False)
	text_y_tolerance = gr.Textbox(label="Text tolerance (y)", placeholder="1", render=False)
	intersection_tolerance = gr.Textbox(
	    label="Intersection tolerance", placeholder="1", render=False
	)
	intersection_x_tolerance = gr.Textbox(
	    label="Intersection tolerance (x)", placeholder="1", render=False
	)
	intersection_y_tolerance = gr.Textbox(
	    label="Intersection tolerance (y)", placeholder="1", render=False
	)
	edge_min_length = gr.Textbox(label="edge_min_length", placeholder="3", render=False)
	min_words_vertical = gr.Textbox(
	    label="min_words_vertical", placeholder="3", render=False
	)
	min_words_horizontal = gr.Textbox(
	    label="min_words_horizontal", placeholder="1", render=False
	)
	keep_blank_chars = gr.Checkbox(label="Keep blank chars?", value=False)

	# Source document and which page/table to inspect. File extensions in
	# ``file_types`` need a leading dot to act as an extension filter.
	file = gr.File(label="PDF", type="filepath", file_types=[".pdf"], render=False)
	page_num = gr.Number(
	    label="Page number", value=0, info="It's an index: first is 0!", render=False
	)
	table_num = gr.Number(
	    label="Table number", value=0, info="It's an index: first is 0!", render=False
	)

	# Sample PDFs live in an "examples" folder next to this file.
	example_dir = Path(os.path.dirname(__file__)) / "examples"

	# One row per example, in the order of the gr.Examples inputs:
	# file, page_num, table_num, vertical_strategy, horizontal_strategy,
	# snap_y_tolerance, intersection_x_tolerance, crop_bottom.
	examples = [
	    [str(example_dir / "players.pdf"), 0, 0, "text", "text", None, None, None],
	    [str(example_dir / "museums.pdf"), 2, 0, "lines", "lines", None, None, None],
	    [str(example_dir / "background-checks.pdf"), 0, 0, "text", "text", 5, 15, 487],
	]

	# Page layout. The components are created earlier in the file; here they
	# are only placed with ``.render()`` and wired to the callbacks.
	# NOTE(review): the nesting of the layout contexts below was reconstructed
	# from statement order and blank-line grouping -- confirm it against the
	# intended page layout.
	with gr.Blocks() as demo:
	    gr.Markdown(
	        """
	        # pdfplumber table extraction playground

	        [pdfplumber](https://github.com/jsvine/pdfplumber/) is a delightful library for processing PDFs, including table extraction. Scroll down for examples and lots more settings!

	        YouTube is full of [pdfplumber tutorials](https://www.youtube.com/results?search_query=pdfplumber), but for the notebook-lovers I recommend [this](https://github.com/jsvine/nicar-2023-pdfplumber-workshop) or [this](https://github.com/jsvine/lede-2023/tree/main/pdf-parsing/).
	        """
	    )

	    with gr.Row():
	        # First column: file upload, main options, Run button and notes.
	        with gr.Column(scale=2):
	            file.render()
	            with gr.Accordion("Table details", open=True):
	                with gr.Group():
	                    with gr.Row():
	                        page_num.render()
	                        table_num.render()

	                    with gr.Row():
	                        vertical_strategy.render()
	                        horizontal_strategy.render()

	            with gr.Accordion("Crop", open=True):
	                with gr.Group():
	                    crop_top.render()
	                    with gr.Row():
	                        crop_x0.render()
	                        crop_x1.render()
	                    crop_bottom.render()

	            btn = gr.Button(value="Run")
	            # The inputs are passed positionally, so this list must follow
	            # the parameter order of table_debugger.
	            btn.click(
	                table_debugger,
	                inputs=[
	                    file,
	                    page_num,
	                    table_num,
	                    crop_x0,
	                    crop_top,
	                    crop_x1,
	                    crop_bottom,
	                    vertical_strategy,
	                    horizontal_strategy,
	                    explicit_vertical_lines,
	                    explicit_horizontal_lines,
	                    snap_tolerance,
	                    snap_x_tolerance,
	                    snap_y_tolerance,
	                    join_tolerance,
	                    join_x_tolerance,
	                    join_y_tolerance,
	                    text_tolerance,
	                    text_x_tolerance,
	                    text_y_tolerance,
	                    intersection_tolerance,
	                    intersection_x_tolerance,
	                    intersection_y_tolerance,
	                    edge_min_length,
	                    min_words_vertical,
	                    min_words_horizontal,
	                    keep_blank_chars,
	                ],
	                outputs=[notes, output_image, data_table],
	            )

	            notes.render()

	        # Second column: extracted table and the annotated page image.
	        with gr.Column(scale=3):
	            data_table.render()
	            output_image.render()

	    # Clicking an example fills these inputs and runs demo_subset, whose
	    # parameters are in the same order as this list.
	    gr.Examples(
	        examples=examples,
	        inputs=[
	            file,
	            page_num,
	            table_num,
	            vertical_strategy,
	            horizontal_strategy,
	            snap_y_tolerance,
	            intersection_x_tolerance,
	            crop_bottom,
	        ],
	        outputs=[notes, output_image, data_table],
	        fn=demo_subset,
	        run_on_click=True,
	    )

	    gr.Markdown("## Additional options")
	    with gr.Row():
	        # Tolerance settings: one general box followed by its x/y pair.
	        with gr.Column():
	            with gr.Group():
	                snap_tolerance.render()
	                with gr.Row():
	                    snap_x_tolerance.render()
	                    snap_y_tolerance.render()
	                join_tolerance.render()
	                with gr.Row():
	                    join_x_tolerance.render()
	                    join_y_tolerance.render()
	                text_tolerance.render()
	                with gr.Row():
	                    text_x_tolerance.render()
	                    text_y_tolerance.render()
	                intersection_tolerance.render()
	                with gr.Row():
	                    intersection_x_tolerance.render()
	                    intersection_y_tolerance.render()

	        # Explicit lines and the remaining settings.
	        with gr.Column():
	            with gr.Group():
	                explicit_vertical_lines.render()
	                explicit_horizontal_lines.render()
	                edge_min_length.render()
	                with gr.Row():
	                    min_words_vertical.render()
	                    min_words_horizontal.render()
	                keep_blank_chars.render()

	# Start the app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()