diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..e08cae4f4e98ccc30348b9eb4a8bef3e7521e137 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,3 @@ +*.json filter=lfs diff=lfs merge=lfs -text +*.jsonl filter=lfs diff=lfs merge=lfs -text +*.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b05e916a40698a0249e9074e33bcdd22fdb55767 --- /dev/null +++ b/README.md @@ -0,0 +1,8 @@ +title: PDF Extraction Comparisson +emoji: 📉 +colorFrom: green +colorTo: gray +sdk: gradio +sdk_version: 5.27.0 +app_file: extractor_compare.py +pinned: false \ No newline at end of file diff --git a/extraction/non_truncated/record_0.json b/extraction/non_truncated/record_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4c37ac383293d77cc4441c0ce7382a589c9d4f1c --- /dev/null +++ b/extraction/non_truncated/record_0.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:167521da014ee1816adfa9667e045dc5835ebbfde0d4c036a1c354e15f51dbca +size 63882 diff --git a/extraction/non_truncated/record_1.json b/extraction/non_truncated/record_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a3ab4694e2e87197f97b67a723d3511783fb5817 --- /dev/null +++ b/extraction/non_truncated/record_1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f0a96a2e7dc9833c93b2817f91ea2cdb46844bd8f216b34ec88bbb3b404183e +size 2355885 diff --git a/extraction/non_truncated/record_10.json b/extraction/non_truncated/record_10.json new file mode 100644 index 0000000000000000000000000000000000000000..2cd6b56d56a57acc12a5bfe3114e2c251bfe4285 --- /dev/null +++ b/extraction/non_truncated/record_10.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09b4a9ac963d2795aef7a86d20944049d2554a7cf75dd94a58c17115dc581299 +size 1477192 diff --git 
a/extraction/non_truncated/record_11.json b/extraction/non_truncated/record_11.json new file mode 100644 index 0000000000000000000000000000000000000000..44ea6089d174860fd9fb9e7534c8ea11991f227b --- /dev/null +++ b/extraction/non_truncated/record_11.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cf21c60728cba6f508dbf44c98cd4b968df2cce2bfcb05f0647762edf54f487 +size 68649 diff --git a/extraction/non_truncated/record_12.json b/extraction/non_truncated/record_12.json new file mode 100644 index 0000000000000000000000000000000000000000..c4f847ebd730aae5cd082006c772bb013e8fa253 --- /dev/null +++ b/extraction/non_truncated/record_12.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc2d0f828fdf221ee601cc6904e599665b4ae12082e2b9d2a76c2f7d32528921 +size 58362 diff --git a/extraction/non_truncated/record_13.json b/extraction/non_truncated/record_13.json new file mode 100644 index 0000000000000000000000000000000000000000..1abe1c27b9d808dcc5bbe1b15154dfed49ba4920 --- /dev/null +++ b/extraction/non_truncated/record_13.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2182ed09edf5a0aa885f1ac28dddf070d96342c8606fd27e4315e44b39ade036 +size 263549 diff --git a/extraction/non_truncated/record_14.json b/extraction/non_truncated/record_14.json new file mode 100644 index 0000000000000000000000000000000000000000..ead1dcb8cb4afebf0545d38313c30b40c0efccb9 --- /dev/null +++ b/extraction/non_truncated/record_14.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3066f1b77f8d4190cf2928a09f883dc6b3f628cf257a9479d9bf822df465e389 +size 610239 diff --git a/extraction/non_truncated/record_15.json b/extraction/non_truncated/record_15.json new file mode 100644 index 0000000000000000000000000000000000000000..6ad6cb270ea3ff5f40473c950a9a94579eb59e20 --- /dev/null +++ b/extraction/non_truncated/record_15.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5eb949316ebfd20294ddd0c1e1ac73a748da4a3f1bfefc819f2c292b54e4ef55 +size 225940 diff --git a/extraction/non_truncated/record_16.json b/extraction/non_truncated/record_16.json new file mode 100644 index 0000000000000000000000000000000000000000..899f93eb060cd8ada9a57f818ba52a6dd86a0320 --- /dev/null +++ b/extraction/non_truncated/record_16.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3f7bec9722662ec468459191a19077ccbdda6f4c4bd7156f0d3ba46908afed5 +size 207706 diff --git a/extraction/non_truncated/record_17.json b/extraction/non_truncated/record_17.json new file mode 100644 index 0000000000000000000000000000000000000000..a66a2c8d33838d582f00c02c8e36424c73369b4d --- /dev/null +++ b/extraction/non_truncated/record_17.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c6eeee023c92b02ccc1dcb9d775e80eab03a82f78a062e266590427aa8cd5cd +size 151205 diff --git a/extraction/non_truncated/record_18.json b/extraction/non_truncated/record_18.json new file mode 100644 index 0000000000000000000000000000000000000000..a8fcf3a1726a24603d0e68188cc4dcbaefefb003 --- /dev/null +++ b/extraction/non_truncated/record_18.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ca829e8020f472452f7c41ec63f3ddfcdd6dce0b04167c383c311e290129124 +size 730429 diff --git a/extraction/non_truncated/record_19.json b/extraction/non_truncated/record_19.json new file mode 100644 index 0000000000000000000000000000000000000000..85c2a9f24d3df17481c713e3cf65a1900c194b77 --- /dev/null +++ b/extraction/non_truncated/record_19.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a5e819288bc5b404228abe38e4e07c966a13597f8cb1ae0336c1d58cebd0573 +size 4347952 diff --git a/extraction/non_truncated/record_2.json b/extraction/non_truncated/record_2.json new file mode 100644 index 0000000000000000000000000000000000000000..32f98892aaa35ee150b12726d6c23c0f2defadfe --- /dev/null +++ 
b/extraction/non_truncated/record_2.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26cb541e4ed8038eb44dce3f74a40b2a748f09e3ce98168c95b1ae6959699e31 +size 200511 diff --git a/extraction/non_truncated/record_20.json b/extraction/non_truncated/record_20.json new file mode 100644 index 0000000000000000000000000000000000000000..869a740aaf4a8ff81e9be6632a714e05aa1822dc --- /dev/null +++ b/extraction/non_truncated/record_20.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3695a6b19de6b483e6f9f4ce417584d3bca274d1a18bb88f1fc3bc3a4e5a372 +size 34621 diff --git a/extraction/non_truncated/record_21.json b/extraction/non_truncated/record_21.json new file mode 100644 index 0000000000000000000000000000000000000000..65b518fdc714f22b964c9b9a631d5fd51b4c0f20 --- /dev/null +++ b/extraction/non_truncated/record_21.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60619c8ebe2caba7c9e2ea3304418a461bd351b90ec6a1a3a3af31fb92ef0196 +size 1403253 diff --git a/extraction/non_truncated/record_22.json b/extraction/non_truncated/record_22.json new file mode 100644 index 0000000000000000000000000000000000000000..569f6ac1e4e00252fdc63ab6399fc0307bcad136 --- /dev/null +++ b/extraction/non_truncated/record_22.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9939ba567c0e42fd17ecef79384c5125e56cb5bd53f2229b0115301d048ecc00 +size 505232 diff --git a/extraction/non_truncated/record_23.json b/extraction/non_truncated/record_23.json new file mode 100644 index 0000000000000000000000000000000000000000..6b337aeb0d4141faf8693c0482a765184900b61e --- /dev/null +++ b/extraction/non_truncated/record_23.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81be2481d68dd31f741082419de4dcfd8aa4575200d1fddad407f476a1dedd76 +size 889179 diff --git a/extraction/non_truncated/record_24.json b/extraction/non_truncated/record_24.json new file mode 100644 index 
0000000000000000000000000000000000000000..c441f6383c4364e23132c535c9fedebd1a6fe3b6 --- /dev/null +++ b/extraction/non_truncated/record_24.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf0f8312ea71e37843d206374692ae41e92f42d442a4c5ec7599eb3ca59e0a88 +size 951925 diff --git a/extraction/non_truncated/record_25.json b/extraction/non_truncated/record_25.json new file mode 100644 index 0000000000000000000000000000000000000000..5e47396101f03ec11e5040e32c24d3624177bd9f --- /dev/null +++ b/extraction/non_truncated/record_25.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86828204d28fb8bbc5031f49548d8a9949767c58ffae242bd401a8996461475b +size 30260 diff --git a/extraction/non_truncated/record_26.json b/extraction/non_truncated/record_26.json new file mode 100644 index 0000000000000000000000000000000000000000..36fdea0db923881ec056b179f3891ba908614f90 --- /dev/null +++ b/extraction/non_truncated/record_26.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a25e63f13b7efa1af614ed705b31a05ea32481e9e354964e1a207652b66a4e19 +size 189677 diff --git a/extraction/non_truncated/record_27.json b/extraction/non_truncated/record_27.json new file mode 100644 index 0000000000000000000000000000000000000000..5087b222a3f4ea47224d33e2a9c6d4a0e7dc8709 --- /dev/null +++ b/extraction/non_truncated/record_27.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7709908a0df97a54b8b99f423d646bb280904efa26c97b83989021aacf32cb5 +size 384357 diff --git a/extraction/non_truncated/record_28.json b/extraction/non_truncated/record_28.json new file mode 100644 index 0000000000000000000000000000000000000000..ab1e166fa0ab73c0fca7ba201cccd368bec11db3 --- /dev/null +++ b/extraction/non_truncated/record_28.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa2f0b2e981d2972a9c8a52234a5a736eb0f6c86cdb36c61e76b8580a4ec6583 +size 38877 diff --git 
a/extraction/non_truncated/record_29.json b/extraction/non_truncated/record_29.json new file mode 100644 index 0000000000000000000000000000000000000000..204ee9091493d885e963e8cd5731b0a9f6eae61c --- /dev/null +++ b/extraction/non_truncated/record_29.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ad40b768f55f12542d29a4abb4c832188401046a13611671199350b4c95d921 +size 135635 diff --git a/extraction/non_truncated/record_3.json b/extraction/non_truncated/record_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dc6b12842d1e0a042a6db92b242801317af223a1 --- /dev/null +++ b/extraction/non_truncated/record_3.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b1ff68c4d0a3f81aaa60d970767832f823c498286f8c149efeb25b9e6079648 +size 655199 diff --git a/extraction/non_truncated/record_30.json b/extraction/non_truncated/record_30.json new file mode 100644 index 0000000000000000000000000000000000000000..3b65913c2da0e90f573021294d697b4e9130107b --- /dev/null +++ b/extraction/non_truncated/record_30.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5ac40be45da7450a9430f86b45eddb1f0913f0884ed84e67dff6a9d9eae0596 +size 196521 diff --git a/extraction/non_truncated/record_31.json b/extraction/non_truncated/record_31.json new file mode 100644 index 0000000000000000000000000000000000000000..ede2bc88082f353fa0f0e2f924444bbf40307897 --- /dev/null +++ b/extraction/non_truncated/record_31.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3120768e0f4f63acfb4ea6c504634c129d40179730e9a268170f4a4cdc1904d +size 885829 diff --git a/extraction/non_truncated/record_32.json b/extraction/non_truncated/record_32.json new file mode 100644 index 0000000000000000000000000000000000000000..df7a54e66e36e1a924f976d01ab91985a1d6c68f --- /dev/null +++ b/extraction/non_truncated/record_32.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8f86e7ab93d134a1fa59bfe062ba6fc226a323f976012320ac61d9b3e56bde0b +size 372363 diff --git a/extraction/non_truncated/record_33.json b/extraction/non_truncated/record_33.json new file mode 100644 index 0000000000000000000000000000000000000000..648a4c5d50c3d4d7f07a9ccd6a083deff33ce7b8 --- /dev/null +++ b/extraction/non_truncated/record_33.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ecf8838485b6667b1362f39f5a0d7886b4a05ed61f80fccdf2f71b9fe9b98c7 +size 5082783 diff --git a/extraction/non_truncated/record_34.json b/extraction/non_truncated/record_34.json new file mode 100644 index 0000000000000000000000000000000000000000..5bab56f56adfde377c0d245c9e2aa9a8ef05ffbf --- /dev/null +++ b/extraction/non_truncated/record_34.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:395237b321ad926362bd520315d527184261f73cf031533aa170d2a36ef80409 +size 38339 diff --git a/extraction/non_truncated/record_35.json b/extraction/non_truncated/record_35.json new file mode 100644 index 0000000000000000000000000000000000000000..afb5c249c73ac06f0d5c2dcc68f314d8b33e7464 --- /dev/null +++ b/extraction/non_truncated/record_35.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45172f2b9c3975dc0fa102323d65c98606edaf0dfcfaf1e94c5075bb6ba58eb9 +size 363612 diff --git a/extraction/non_truncated/record_36.json b/extraction/non_truncated/record_36.json new file mode 100644 index 0000000000000000000000000000000000000000..3803304f084ce370e6e42547251b19ab56a8f328 --- /dev/null +++ b/extraction/non_truncated/record_36.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1838aa1feed4c9f70dd600276f4d83249ab8f046acc9d67a9028abacde224af1 +size 696226 diff --git a/extraction/non_truncated/record_37.json b/extraction/non_truncated/record_37.json new file mode 100644 index 0000000000000000000000000000000000000000..58d483d94be6ca4adc364cecb29e576bc032ab54 --- /dev/null +++ 
b/extraction/non_truncated/record_37.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eebf8813eba242eaf54427058dadd927c037074645fb6872b068f692e35d8cc +size 188427 diff --git a/extraction/non_truncated/record_38.json b/extraction/non_truncated/record_38.json new file mode 100644 index 0000000000000000000000000000000000000000..36e3bde5fbf7c06207f9b25e42d984c38b257ec2 --- /dev/null +++ b/extraction/non_truncated/record_38.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c90ff6c48476635f96802b94a776605f887bcbc305b23e1b4c579c3f8ae567b9 +size 858917 diff --git a/extraction/non_truncated/record_39.json b/extraction/non_truncated/record_39.json new file mode 100644 index 0000000000000000000000000000000000000000..a922ee7d115b2e502f6d591e0d1b9c5b7c47b745 --- /dev/null +++ b/extraction/non_truncated/record_39.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0859d7ced21f7147b79de2e68798076063f3630379e1bd62097750191658d2ba +size 231498 diff --git a/extraction/non_truncated/record_4.json b/extraction/non_truncated/record_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d8d3300f172c27069e50cedab82631aef67c5c03 --- /dev/null +++ b/extraction/non_truncated/record_4.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5dc03dd9e70cbdfbe2f1a2010686f18f984ca5c4d94e6f006de0d4225c43c57 +size 336622 diff --git a/extraction/non_truncated/record_40.json b/extraction/non_truncated/record_40.json new file mode 100644 index 0000000000000000000000000000000000000000..9338c8f24808b483b6dbb6d53592ee71f43b5f1a --- /dev/null +++ b/extraction/non_truncated/record_40.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6831ac023d481844a3368394b5853697f6d1ac5b7ee8042f606c2cc72de28492 +size 189523 diff --git a/extraction/non_truncated/record_41.json b/extraction/non_truncated/record_41.json new file mode 100644 index 
0000000000000000000000000000000000000000..bbe7e854a4a87c047f9ebe5804e975246986767a --- /dev/null +++ b/extraction/non_truncated/record_41.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b2c3b9c753feb18018980143409d4a670a2b36f85f271d66e3574297391bb69 +size 1717290 diff --git a/extraction/non_truncated/record_42.json b/extraction/non_truncated/record_42.json new file mode 100644 index 0000000000000000000000000000000000000000..2e2cacdb145f2cca31ac469368cbcc791992714d --- /dev/null +++ b/extraction/non_truncated/record_42.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:103d572230006e3231311feaa9fb18541fbe7347330c195e7e49f8462c131936 +size 504077 diff --git a/extraction/non_truncated/record_43.json b/extraction/non_truncated/record_43.json new file mode 100644 index 0000000000000000000000000000000000000000..d5bd23e52604cd16c1c8a06e1b926a0ea02501d8 --- /dev/null +++ b/extraction/non_truncated/record_43.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c26bd2a27e3f7c1cae06a965f4b36333d9eac548238f155a9f22502c1edc9ed8 +size 178433 diff --git a/extraction/non_truncated/record_44.json b/extraction/non_truncated/record_44.json new file mode 100644 index 0000000000000000000000000000000000000000..fc8238bf1ce9d464f391807891228c425fd21b4a --- /dev/null +++ b/extraction/non_truncated/record_44.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc3c4e63fe0d91dcc7927005b6bbce23aad1f6a76f438b12334952652bf5811f +size 578235 diff --git a/extraction/non_truncated/record_45.json b/extraction/non_truncated/record_45.json new file mode 100644 index 0000000000000000000000000000000000000000..f1e4aea189d610c9c06495a2d7f8edfed5ac4846 --- /dev/null +++ b/extraction/non_truncated/record_45.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4828368d4df51ee31aac1d332965e597d2ad7e7d69eb12c62547984d333b9c7 +size 65628 diff --git 
a/extraction/non_truncated/record_46.json b/extraction/non_truncated/record_46.json new file mode 100644 index 0000000000000000000000000000000000000000..e87043da1736b9ca9e498e0edacc0d57dae280f0 --- /dev/null +++ b/extraction/non_truncated/record_46.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1a5c9af7071dc69a5a2ebc8ba8fa75e55dbcca845215d60cfd82662803f05b4 +size 353066 diff --git a/extraction/non_truncated/record_47.json b/extraction/non_truncated/record_47.json new file mode 100644 index 0000000000000000000000000000000000000000..06e89f852a5decd81c328e461f96d9a3fa0d546c --- /dev/null +++ b/extraction/non_truncated/record_47.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cb7f41c1b0950e0a87846f0b79077eadbff0d70dad758904db03df28b4db761 +size 587671 diff --git a/extraction/non_truncated/record_48.json b/extraction/non_truncated/record_48.json new file mode 100644 index 0000000000000000000000000000000000000000..26ac45e3097418a3b09c5167e1d848a61546dd4c --- /dev/null +++ b/extraction/non_truncated/record_48.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a57ef801045de8bb4bd18e4ba6e9385a97b64b1263e8e2f215b53ef2c3a3785 +size 139104 diff --git a/extraction/non_truncated/record_49.json b/extraction/non_truncated/record_49.json new file mode 100644 index 0000000000000000000000000000000000000000..5292e76b0e7410ff139b60c43594e29e581b6841 --- /dev/null +++ b/extraction/non_truncated/record_49.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86703b12aaa2586771ad8140184dedfe719c490bfdeaf1098d7b447d03b5a1e3 +size 424608 diff --git a/extraction/non_truncated/record_5.json b/extraction/non_truncated/record_5.json new file mode 100644 index 0000000000000000000000000000000000000000..eb458583a61a17fa59bfdfc47005943369df1bca --- /dev/null +++ b/extraction/non_truncated/record_5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e81014ac3d91bf012f645e1732f1292ab417a4bffe50f0a37d3ba5bb99cb0549 +size 463779 diff --git a/extraction/non_truncated/record_6.json b/extraction/non_truncated/record_6.json new file mode 100644 index 0000000000000000000000000000000000000000..098a587ba50d81238061302da5d8b3efb118a3ac --- /dev/null +++ b/extraction/non_truncated/record_6.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c581ce6b5af035a25ba1f0a0a3c3c25bb95bb3a8009c17a09430d9d51153a4e7 +size 723599 diff --git a/extraction/non_truncated/record_7.json b/extraction/non_truncated/record_7.json new file mode 100644 index 0000000000000000000000000000000000000000..5a304ce4d61d539f1f3b78b2a92d99f0f7394913 --- /dev/null +++ b/extraction/non_truncated/record_7.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e71c27130c52ca65accba4ca8415144d3aafc3167bafcc0e0736978e192b1745 +size 402046 diff --git a/extraction/non_truncated/record_8.json b/extraction/non_truncated/record_8.json new file mode 100644 index 0000000000000000000000000000000000000000..d11986d47d42afdd733c010b74362b0a84f203db --- /dev/null +++ b/extraction/non_truncated/record_8.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be972a275f06769baf5f75d7ee4e3b047c12a31b0aac6a66170af88a50362cc5 +size 1033561 diff --git a/extraction/non_truncated/record_9.json b/extraction/non_truncated/record_9.json new file mode 100644 index 0000000000000000000000000000000000000000..6932a93b1323e881c0b7b19164acd37b10cdf8d6 --- /dev/null +++ b/extraction/non_truncated/record_9.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dce810c5186728b2fad7d8cf94216a547aeb72e7b0a701798ff90c1b572e49fb +size 760019 diff --git a/extraction/truncated/record_0.json b/extraction/truncated/record_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a49edcc577f73c477302f7bc3f7160a8126aefc6 --- /dev/null +++ b/extraction/truncated/record_0.json @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:d9f854027e90ea9f907f07175a60050b6b8871147798f32390afb7640188061e +size 25756717 diff --git a/extraction/truncated/record_1.json b/extraction/truncated/record_1.json new file mode 100644 index 0000000000000000000000000000000000000000..021789b8d7ed6fc0719fc86c6262c7e9a1ddb452 --- /dev/null +++ b/extraction/truncated/record_1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b90300a8372c960618048b0d8c3b7ff093bf644a61033d915b80bd963806cf6 +size 4145981 diff --git a/extraction/truncated/record_10.json b/extraction/truncated/record_10.json new file mode 100644 index 0000000000000000000000000000000000000000..07aab0e2d65770d8b3310c18a22d79cef08470cf --- /dev/null +++ b/extraction/truncated/record_10.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f26172de5fa420716f6d430793fbf89cee0ab660b7a17bd120d310cf4c5f1a3e +size 4194517 diff --git a/extraction/truncated/record_11.json b/extraction/truncated/record_11.json new file mode 100644 index 0000000000000000000000000000000000000000..65cc29fa6bad5b1c83e8e1c5f07939dbb9630c00 --- /dev/null +++ b/extraction/truncated/record_11.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2123eb5aa353531fa34c4b50cfc6d246df72f695a19cc94d4d8f292b089ec94f +size 2580533 diff --git a/extraction/truncated/record_12.json b/extraction/truncated/record_12.json new file mode 100644 index 0000000000000000000000000000000000000000..91b29cb2c1fce72d41503ee5a6a2e93305cfe5ca --- /dev/null +++ b/extraction/truncated/record_12.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef89883caaef3de85db14ce08255574c7700eb1a13c5d2903a23a06765723c5c +size 3263274 diff --git a/extraction/truncated/record_13.json b/extraction/truncated/record_13.json new file mode 100644 index 0000000000000000000000000000000000000000..39b065ae92b559b0b97e22026f73a65c98c98e12 --- /dev/null +++ 
b/extraction/truncated/record_13.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ee48fa675c83576ae4f2e700be03814c68b319bcdbba5a79f22a5a49d6107cf +size 41042075 diff --git a/extraction/truncated/record_14.json b/extraction/truncated/record_14.json new file mode 100644 index 0000000000000000000000000000000000000000..a3c9f3b375eaf9e5d38bb3c5c92a659b7227bf46 --- /dev/null +++ b/extraction/truncated/record_14.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d937bf2936be2ffd69f5da617721eda8daefb79795be425fe96ec4123a3a81d +size 8245268 diff --git a/extraction/truncated/record_15.json b/extraction/truncated/record_15.json new file mode 100644 index 0000000000000000000000000000000000000000..8c3069bbf4d83ff5b48bd68320b3c46c825ca9c9 --- /dev/null +++ b/extraction/truncated/record_15.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:031fb3f56bae12cde5cf90b0f8975b4ef4996859e8fa99467f82ed1f8918f2fa +size 24832502 diff --git a/extraction/truncated/record_16.json b/extraction/truncated/record_16.json new file mode 100644 index 0000000000000000000000000000000000000000..61a3b467a480054ee86040d4030d91dc54448127 --- /dev/null +++ b/extraction/truncated/record_16.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70a8d663ea0c7e1f9abb740e04b825d5ede2dfc6a42d5b462ca2292b59aa6e72 +size 10585522 diff --git a/extraction/truncated/record_17.json b/extraction/truncated/record_17.json new file mode 100644 index 0000000000000000000000000000000000000000..e7b985cc9f96b48ca2579725967267d463fce1b0 --- /dev/null +++ b/extraction/truncated/record_17.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:213bb6081b74e6a5e7317b5de229105a1225cb79e7205e99ef528a3f00f1ee1e +size 2200094 diff --git a/extraction/truncated/record_18.json b/extraction/truncated/record_18.json new file mode 100644 index 
0000000000000000000000000000000000000000..198838d0c98777d24110b032e88ecf1b7e446c5b --- /dev/null +++ b/extraction/truncated/record_18.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9670772adf0fa0660dbf0432cda18327a6a6483ad5e05d1b14292c87ebe33628 +size 8938751 diff --git a/extraction/truncated/record_19.json b/extraction/truncated/record_19.json new file mode 100644 index 0000000000000000000000000000000000000000..5daece9f6c55753b7eceb010395d760de68689b8 --- /dev/null +++ b/extraction/truncated/record_19.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c0e0099a03748dc387c75c1754aa1f916449623f83b1c6b83b425a2d6c0284e +size 2321802 diff --git a/extraction/truncated/record_2.json b/extraction/truncated/record_2.json new file mode 100644 index 0000000000000000000000000000000000000000..572a1b595cd8f2b948e93cf94bbb4c3a73886b4d --- /dev/null +++ b/extraction/truncated/record_2.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0673f4d8c977afee6f5e0c22f6217cf9285552a4424958d79eb6d51e1fdd12a +size 7181245 diff --git a/extraction/truncated/record_20.json b/extraction/truncated/record_20.json new file mode 100644 index 0000000000000000000000000000000000000000..3178822925f59c2a090769c56a549377fe76c6df --- /dev/null +++ b/extraction/truncated/record_20.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:991245b6596640db34394f15c8a7ae16239707d9ceb175a9820d0dae21e23857 +size 3105031 diff --git a/extraction/truncated/record_21.json b/extraction/truncated/record_21.json new file mode 100644 index 0000000000000000000000000000000000000000..55cd682c41d5bf3d970bd60d5a966c81141dd3b8 --- /dev/null +++ b/extraction/truncated/record_21.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2447dc54a55ab851c0c484c3357126000849aaf4155e28aa6afcdaebd1e720bf +size 10944540 diff --git a/extraction/truncated/record_22.json b/extraction/truncated/record_22.json new file 
mode 100644 index 0000000000000000000000000000000000000000..e22920acdfb280cf817bd041ed7dbab4cdeb0ef5 --- /dev/null +++ b/extraction/truncated/record_22.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75187a3be16a8dc53e8fb28212f4e577d46ba6e052ca9df3a20c0eab5a91cbdb +size 2416047 diff --git a/extraction/truncated/record_23.json b/extraction/truncated/record_23.json new file mode 100644 index 0000000000000000000000000000000000000000..af17db423338e9eaf2e1f76c86bb5de0e3acd51f --- /dev/null +++ b/extraction/truncated/record_23.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad2d636f829de3d4bb72672441ff7a7a100b632ebca8476e5ad487b174a7b0c9 +size 4818556 diff --git a/extraction/truncated/record_24.json b/extraction/truncated/record_24.json new file mode 100644 index 0000000000000000000000000000000000000000..fa68211ee093ed5ef705b5dd4272a6d64789cf0a --- /dev/null +++ b/extraction/truncated/record_24.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a18111223e430dc7ff325275c99e1d7ebfb67996879772d6f67616a2ebc7eb66 +size 1751628 diff --git a/extraction/truncated/record_25.json b/extraction/truncated/record_25.json new file mode 100644 index 0000000000000000000000000000000000000000..5c8133e6a28e363b16d0c9b08ab4c801a3e5f775 --- /dev/null +++ b/extraction/truncated/record_25.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:127a04149e2f7ac4116900d1a7121f83ec7c2f447d59903ec5bb2a93524b6f8e +size 17538244 diff --git a/extraction/truncated/record_26.json b/extraction/truncated/record_26.json new file mode 100644 index 0000000000000000000000000000000000000000..f2cd460905e7b907b43036ec7fc24aac9d73480a --- /dev/null +++ b/extraction/truncated/record_26.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:848c0b5224b721d57af7f55e29a7962df499b2ef12e91581558d0d9423361d06 +size 3356738 diff --git a/extraction/truncated/record_27.json 
b/extraction/truncated/record_27.json new file mode 100644 index 0000000000000000000000000000000000000000..4cc742daac06da161c51287d77b95a3e67d745da --- /dev/null +++ b/extraction/truncated/record_27.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dda37e766399267a1440fa7cc935147e87b1aef03e3a9ed4404628fbfed5e85 +size 3583167 diff --git a/extraction/truncated/record_28.json b/extraction/truncated/record_28.json new file mode 100644 index 0000000000000000000000000000000000000000..e46fbdd448cd9be649a68bc97fc5ad6c43dbd101 --- /dev/null +++ b/extraction/truncated/record_28.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8421b5ca38cd0ffc41dd6bd5fbc68c378f62bd8cd6b683696f5577d570c14c7 +size 4302378 diff --git a/extraction/truncated/record_29.json b/extraction/truncated/record_29.json new file mode 100644 index 0000000000000000000000000000000000000000..ff97c1b9639b3bb4920a7db57275884c29c00ea4 --- /dev/null +++ b/extraction/truncated/record_29.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ccacb911774371f5baca43b6a6f12c8c3146582fc30e6dc29f13cebee3f32dc +size 3582508 diff --git a/extraction/truncated/record_3.json b/extraction/truncated/record_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9f166c1a0fff2d57710cb25e98c78b02c0eb3e8b --- /dev/null +++ b/extraction/truncated/record_3.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8496fefb4aae23d27ed8264b406b3538c398bf0a64a20a44eb717bc7bc146489 +size 9584404 diff --git a/extraction/truncated/record_30.json b/extraction/truncated/record_30.json new file mode 100644 index 0000000000000000000000000000000000000000..390bc8e4ada4c3b23cfc029faad0eb4bbb641a71 --- /dev/null +++ b/extraction/truncated/record_30.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01c03b30a9fe4758d4784cdfbbbfb21f772cab1de362ba84bc2fbbe924f7a916 +size 1723861 diff --git 
a/extraction/truncated/record_31.json b/extraction/truncated/record_31.json new file mode 100644 index 0000000000000000000000000000000000000000..9544c20390699d5753e1b4e11dda3a97ef5e29da --- /dev/null +++ b/extraction/truncated/record_31.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb9c6baae05618eb62f2b7db923a78f93d657ba8845ffc1d95afe27e7f3afb6b +size 21088951 diff --git a/extraction/truncated/record_32.json b/extraction/truncated/record_32.json new file mode 100644 index 0000000000000000000000000000000000000000..27279f2c3d75968aea88a06b3ee3cfed29c05aa8 --- /dev/null +++ b/extraction/truncated/record_32.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db93f7c4b60c496e2ff10413e0dceddf5b45db0a3c0515f9f73c8ccea92166e0 +size 5262069 diff --git a/extraction/truncated/record_33.json b/extraction/truncated/record_33.json new file mode 100644 index 0000000000000000000000000000000000000000..9c9d5f0add60b4f31174d84f1749f4756d0e3ce2 --- /dev/null +++ b/extraction/truncated/record_33.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:151558fc315cc47bf8d9fa010a65a142970f425460a43933288f2871072150bf +size 59458095 diff --git a/extraction/truncated/record_34.json b/extraction/truncated/record_34.json new file mode 100644 index 0000000000000000000000000000000000000000..7f93a224d17ec6d4054676650b8fc3f32a4dd216 --- /dev/null +++ b/extraction/truncated/record_34.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:571dc26a89b61440bedd04e2b47feff4fa41203b6da252c75b40d957d4d02ed1 +size 7331042 diff --git a/extraction/truncated/record_35.json b/extraction/truncated/record_35.json new file mode 100644 index 0000000000000000000000000000000000000000..ede6387e89915a1cbb7cd3ad8e9db68ee6ee1b53 --- /dev/null +++ b/extraction/truncated/record_35.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e305511201d9892214e0f1c35552b343bb87b6d018b25c61e3181f5fdd50496 
+size 1255709 diff --git a/extraction/truncated/record_36.json b/extraction/truncated/record_36.json new file mode 100644 index 0000000000000000000000000000000000000000..4abdd5ac39066b5e376712b4ad64bf817f94edf4 --- /dev/null +++ b/extraction/truncated/record_36.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ff727e3a74f32bbcdd7df4a8fc54fa05cbee106af90ee41e33968c6b42f16a0 +size 9710246 diff --git a/extraction/truncated/record_37.json b/extraction/truncated/record_37.json new file mode 100644 index 0000000000000000000000000000000000000000..e02074a18b9efaf1442b2737461059752d03572e --- /dev/null +++ b/extraction/truncated/record_37.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62c302266f1792d3ad133d71ec6122648db66f2d4c5780a8f0dfe58193d4bafd +size 1595130 diff --git a/extraction/truncated/record_38.json b/extraction/truncated/record_38.json new file mode 100644 index 0000000000000000000000000000000000000000..41fa480378c99df1ee48ad15471450f555f80886 --- /dev/null +++ b/extraction/truncated/record_38.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:287acb8b7e6beb5cd79dba63fa5908a60ed19f227d12c1bd073cc63493165b3d +size 2124023 diff --git a/extraction/truncated/record_39.json b/extraction/truncated/record_39.json new file mode 100644 index 0000000000000000000000000000000000000000..c677d7e17822a2eeb490ffa0968195cd5e9e4512 --- /dev/null +++ b/extraction/truncated/record_39.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc6886948acfa477ab9c4610a1ef021711d66ad5192c028f069369c671b7cf4 +size 8402211 diff --git a/extraction/truncated/record_4.json b/extraction/truncated/record_4.json new file mode 100644 index 0000000000000000000000000000000000000000..483a545e8385e77c81f2a9ba8120abeb62c4b9af --- /dev/null +++ b/extraction/truncated/record_4.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:687e384c43cdd0c984b2ded2d5be8242ff93e82e1316ab4ad1da6a438ebb5563 +size 6800433 diff --git a/extraction/truncated/record_5.json b/extraction/truncated/record_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ea9a97fc96cdb6883c3876a80c969e60381ded6a --- /dev/null +++ b/extraction/truncated/record_5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ce6c1802678e0302deb7d3833380311ed671c1408f65f539bdd5910607e08bd +size 4264032 diff --git a/extraction/truncated/record_6.json b/extraction/truncated/record_6.json new file mode 100644 index 0000000000000000000000000000000000000000..90c4104bfc88a5cc9745fb66d55cb5a7d1ed3aa4 --- /dev/null +++ b/extraction/truncated/record_6.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:536f130416453676aa8e3f517ce47a66e6ef95c66f2b58931b3c073ed2e6742d +size 20471457 diff --git a/extraction/truncated/record_7.json b/extraction/truncated/record_7.json new file mode 100644 index 0000000000000000000000000000000000000000..e7b28ca4cf565f5304ed8b1b77179a5940971995 --- /dev/null +++ b/extraction/truncated/record_7.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2501caa9a61be302e7ec61b79cd6c6f70c443777175e227190e9b66ca55ba751 +size 13878491 diff --git a/extraction/truncated/record_8.json b/extraction/truncated/record_8.json new file mode 100644 index 0000000000000000000000000000000000000000..3997ac0f0ed1cd550172e84e01b6b2c4a631d15a --- /dev/null +++ b/extraction/truncated/record_8.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47dfa5fceafd678dde151979cc4f9e38bb2cee5bafad2c21a8fb5a3c04f7734d +size 3308505 diff --git a/extraction/truncated/record_9.json b/extraction/truncated/record_9.json new file mode 100644 index 0000000000000000000000000000000000000000..262b0789a11c537ab76e76deeb6f9387060d22fa --- /dev/null +++ b/extraction/truncated/record_9.json @@ -0,0 +1,3 @@ +version 
EXTRACTORS = ['pdf_plumber', 'py_pdf', 'docling', 'extractous', 'pypdfium2', 'pymupdf', 'pymupdf_llm']

# Marker inserted between pages by add_page_breaks().
_PAGE_BREAK = "\n<---page-break--->\n"


def add_page_breaks(text, page_offsets):
    """Return *text* with a page-break marker inserted after each offset.

    Args:
        text: The extracted document text.
        page_offsets: Character offsets used, in the order given, as slice
            boundaries into *text*. A marker is emitted after every offset,
            including one that equals ``len(text)``.

    Returns:
        *text* unchanged when *page_offsets* is empty or ``None``; otherwise
        the text with one marker per offset.
    """
    if not page_offsets:
        return text

    pieces = []
    start = 0
    for offset in page_offsets:
        pieces.append(text[start:offset])
        pieces.append(_PAGE_BREAK)
        start = offset

    # Keep whatever follows the final offset.
    if start < len(text):
        pieces.append(text[start:])

    return "".join(pieces)


class ExtractorComparer:
    """Step through extraction-result JSON files and record the best extractor.

    State kept on the instance:
        json_files: paths of the ``.json``/``.jsonl`` files found by
            ``load_files``.
        current_index: position of the current file in ``json_files``.
        current_data: parsed JSON of the current file, or ``None``.
        current_pdf_bytes: decoded PDF taken from the ``pdf_plumber`` entry of
            the current file, or ``None``.
        temp_pdf_path: path of the temporary ``.pdf`` written for the most
            recent record that carried a PDF.

    Each JSON file is read as a mapping from extractor name to a record with
    an optional ``text`` string and an optional ``media`` list. Only the first
    media item is inspected: ``media_bytes`` (base64 PDF, read from the
    ``pdf_plumber`` record) and ``metadata.pdf_metadata.page_offsets``.
    """

    def __init__(self):
        self.json_files = []
        self.current_index = 0
        self.current_data = None
        self.temp_pdf_path = None
        self.current_pdf_bytes = None

    @staticmethod
    def _annotation_path(json_path):
        """Return the ``*_best.txt`` path that stores the annotation for *json_path*."""
        return os.path.splitext(json_path)[0] + "_best.txt"

    @staticmethod
    def _natural_key(path):
        """Sort key ordering file names with embedded numbers numerically."""
        import re  # local import: only needed for this key

        name = os.path.basename(path)
        tokens = re.split(r'([0-9]+)', name.lower())
        # re.split with one capture group alternates text (even index) and
        # digits (odd index), so equal positions always compare equal types.
        return [int(tok) if i % 2 else tok for i, tok in enumerate(tokens)], name

    @staticmethod
    def _decode_embedded_pdf(data):
        """Return the PDF bytes embedded in the ``pdf_plumber`` record, or ``None``.

        ``None`` is returned when any level of the expected structure is
        missing or has the wrong type, or when the base64 payload is invalid.
        """
        if not isinstance(data, dict):
            return None
        plumber_data = data.get('pdf_plumber')
        if not isinstance(plumber_data, dict):
            return None
        media = plumber_data.get('media')
        if not isinstance(media, list) or not media:
            return None
        media_item = media[0]
        if not isinstance(media_item, dict):
            return None
        encoded = media_item.get('media_bytes')
        if not encoded:
            return None
        try:
            return base64.b64decode(encoded)
        except (ValueError, TypeError):
            # binascii.Error is a ValueError; TypeError covers non-string payloads.
            return None

    @staticmethod
    def _page_offsets(extractor_data):
        """Return the ``page_offsets`` list of the first media item, or ``[]``."""
        media = extractor_data.get('media')
        if not isinstance(media, list) or not media:
            return []
        node = media[0]
        for key in ('metadata', 'pdf_metadata', 'page_offsets'):
            if not isinstance(node, dict) or key not in node:
                return []
            node = node[key]
        return node if isinstance(node, list) else []

    def _replace_temp_pdf(self, pdf_bytes):
        """Write *pdf_bytes* to a new temporary ``.pdf`` and drop the previous one."""
        if self.temp_pdf_path:
            try:
                os.remove(self.temp_pdf_path)
            except OSError:
                pass  # best-effort cleanup; the old file may already be gone
            self.temp_pdf_path = None

        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            temp_file.write(pdf_bytes)
            self.temp_pdf_path = temp_file.name

    def _step(self, delta):
        """Move the cursor by *delta* (wrapping around) and load that file."""
        if not self.json_files:
            return None, "N/A", "N/A"

        self.current_index = (self.current_index + delta) % len(self.json_files)
        return self.load_current_file()

    def load_files(self, directory_path):
        """Collect the ``.json``/``.jsonl`` files directly inside *directory_path*.

        The files are stored in natural name order (``record_2`` before
        ``record_10``) so navigation is deterministic; ``os.listdir`` itself
        returns entries in arbitrary order.

        Returns:
            ``(file_progress, annotation_status)`` for the first file, or a
            pair of message strings when nothing was found or the directory
            could not be listed.
        """
        self.json_files = []
        try:
            names = os.listdir(directory_path)
        except (OSError, ValueError) as e:
            return f"Error loading files: {str(e)}", "Error"

        matches = [
            os.path.join(directory_path, name)
            for name in names
            if name.endswith(('.json', '.jsonl'))
        ]
        if not matches:
            return "No JSON files found", "No files loaded"

        self.json_files = sorted(matches, key=self._natural_key)
        self.current_index = 0
        return self.get_progress_info()

    def load_current_file(self):
        """Parse the current JSON file and extract its embedded PDF.

        The file is parsed as a single JSON document, so a ``.jsonl`` file
        holding several records is reported as a load error.

        Returns:
            ``(base64_pdf, file_progress, annotation_status)``. ``base64_pdf``
            is ``None`` when the record carries no PDF. On a read or parse
            failure the result is ``(None, "Error loading file",
            "No annotation")``.
        """
        if not self.json_files:
            return None, "N/A", "N/A"

        # Forget the previous record first, so a failed load can never leave
        # another file's data behind for get_extractor_text/set_best_extractor.
        self.current_data = None
        self.current_pdf_bytes = None

        try:
            # Binary mode lets json detect the UTF-8/16/32 encoding itself
            # instead of depending on the locale default.
            with open(self.json_files[self.current_index], 'rb') as f:
                self.current_data = json.load(f)
        except (OSError, ValueError):
            return None, "Error loading file", "No annotation"

        pdf_bytes = self._decode_embedded_pdf(self.current_data)
        self.current_pdf_bytes = pdf_bytes
        if not pdf_bytes:
            file_progress, annotation_status = self.get_progress_info()
            return None, file_progress, annotation_status

        try:
            self._replace_temp_pdf(pdf_bytes)
        except OSError:
            return None, "Error loading file", "No annotation"

        # Re-encode for the caller rather than echoing the stored string.
        base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
        file_progress, annotation_status = self.get_progress_info()
        return base64_pdf, file_progress, annotation_status

    def get_progress_info(self):
        """Describe the current position and the annotation state.

        Returns:
            ``(file_progress, annotation_status)``: the first string names the
            current file, its 1-based position and how many files have a
            ``*_best.txt`` next to them; the second reports the recorded best
            extractor for the current file, or ``"Not annotated"``.
        """
        if not self.json_files:
            return "No files loaded", "No annotation"

        current_file = self.json_files[self.current_index]
        total = len(self.json_files)

        annotation_status = "Not annotated"
        best_path = self._annotation_path(current_file)
        if os.path.exists(best_path):
            try:
                with open(best_path, 'r', encoding='utf-8') as f:
                    annotation_status = f"Best extractor: {f.read().strip()}"
            except (OSError, ValueError):
                pass  # unreadable annotation is reported as "Not annotated"

        annotated_count = sum(
            1 for path in self.json_files
            if os.path.exists(self._annotation_path(path))
        )

        file_progress = (
            f"File {self.current_index + 1} of {total}: {Path(current_file).name}"
            f" (Annotated: {annotated_count}/{total})"
        )
        return file_progress, annotation_status

    def get_extractor_text(self, extractor_name):
        """Return the text of *extractor_name* with page-break markers inserted.

        Returns ``""`` when no file is loaded or the extractor is absent, and
        a ``"No text found for ..."`` message when its record is not a mapping
        or has no ``text`` key.
        """
        data = self.current_data
        if not isinstance(data, dict) or extractor_name not in data:
            return ""

        extractor_data = data[extractor_name]
        if not isinstance(extractor_data, dict) or 'text' not in extractor_data:
            return f"No text found for {extractor_name}"

        # A null text is treated as empty so slicing below cannot fail.
        text = extractor_data.get('text') or ''
        return add_page_breaks(text, self._page_offsets(extractor_data))

    def next_pdf(self):
        """Advance to the next file (wrapping to the first) and load it."""
        return self._step(1)

    def prev_pdf(self):
        """Go back to the previous file (wrapping to the last) and load it."""
        return self._step(-1)

    def set_best_extractor(self, extractor_name):
        """Write *extractor_name* to the current file's ``*_best.txt``.

        Returns:
            The refreshed ``(file_progress, annotation_status)`` pair,
            ``("N/A", "N/A")`` when no file is loaded, or
            ``("Error saving annotation", "No annotation")`` when the name is
            not a string or the file cannot be written.
        """
        if not self.json_files or not self.current_data:
            return "N/A", "N/A"

        # Reject bad input before opening, so a failed write cannot leave an
        # empty annotation file that would count as "annotated".
        if not isinstance(extractor_name, str):
            return "Error saving annotation", "No annotation"

        result_file = self._annotation_path(self.json_files[self.current_index])
        try:
            with open(result_file, 'w', encoding='utf-8') as f:
                f.write(extractor_name)
        except OSError:
            return "Error saving annotation", "No annotation"

        return self.get_progress_info()