Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -312,7 +312,7 @@ def extract_text_outside_tables(root, table_paragraphs):
|
|
312 |
continue # Skip paragraphs inside tables
|
313 |
|
314 |
texts = [t.text.strip() for t in paragraph.findall('.//w:t', NS) if t.text]
|
315 |
-
line = clean_spaces(' '.join(texts).replace('
|
316 |
|
317 |
if ':' in line:
|
318 |
extracted_text.append(line)
|
@@ -334,6 +334,7 @@ def extract_docx_as_xml(file_bytes, save_xml=False, xml_filename="document.xml")
|
|
334 |
f.write(xml_content)
|
335 |
return xml_content
|
336 |
|
|
|
337 |
def xml_to_json(xml_content, save_json=False, json_filename="extracted_data.json"):
|
338 |
|
339 |
tree = ET.ElementTree(ET.fromstring(xml_content))
|
@@ -349,7 +350,8 @@ def xml_to_json(xml_content, save_json=False, json_filename="extracted_data.json
|
|
349 |
|
350 |
return json.dumps(extracted_data, ensure_ascii=False, indent=4)
|
351 |
|
352 |
-
|
|
|
353 |
"""Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
|
354 |
|
355 |
# Step 1: Convert JSON string to Python dictionary
|
@@ -359,24 +361,24 @@ def deepseek_extract_contract_summary(json_data, save_json=False):
|
|
359 |
filtered_contract_data = {key: value for key, value in contract_data.items() if "long_table" not in key}
|
360 |
|
361 |
# Step 3: Convert back to JSON string (if needed)
|
362 |
-
json_output = json.dumps(
|
363 |
|
364 |
prompt = """You are given a contract in JSON format. Extract the following information:
|
365 |
|
366 |
# Response Format
|
367 |
-
Return the extracted information as a structured JSON in the exact format shown below (Do not repeat any keys):
|
368 |
|
369 |
{
|
370 |
"合同编号":
|
371 |
-
"采购经办人":
|
372 |
-
"接收人":
|
373 |
"Recipient":
|
374 |
-
"接收地":
|
375 |
-
"Place of receipt":
|
376 |
"供应商":
|
377 |
-
"币种":
|
378 |
-
"合同日期":
|
379 |
-
"供货日期":
|
380 |
}
|
381 |
|
382 |
Contract data in JSON format:""" + f"""
|
@@ -389,6 +391,7 @@ Contract data in JSON format:""" + f"""
|
|
389 |
}
|
390 |
]
|
391 |
|
|
|
392 |
client = OpenAI(
|
393 |
base_url="https://router.huggingface.co/novita",
|
394 |
api_key=HF_API_KEY,
|
@@ -397,18 +400,48 @@ Contract data in JSON format:""" + f"""
|
|
397 |
completion = client.chat.completions.create(
|
398 |
model="deepseek/deepseek-r1-distill-qwen-14b",
|
399 |
messages=messages,
|
|
|
400 |
)
|
401 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
402 |
contract_summary = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL) # Remove think
|
403 |
|
404 |
contract_summary = re.sub(r"^```json\n|```$", "", contract_summary, flags=re.DOTALL) # Remove ```
|
405 |
|
406 |
if save_json:
|
407 |
-
with open(
|
408 |
f.write(contract_summary)
|
409 |
|
410 |
return json.dumps(contract_summary, ensure_ascii=False, indent=4)
|
411 |
|
|
|
412 |
def deepseek_extract_price_list(json_data):
|
413 |
"""Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
|
414 |
|
@@ -486,17 +519,20 @@ def extract_po(docx_path):
|
|
486 |
|
487 |
# Step 1: Extract XML content from DOCX
|
488 |
print("Extracting Docs data to XML...")
|
489 |
-
|
|
|
490 |
|
491 |
get_namespace(ET.fromstring(xml_file))
|
492 |
|
493 |
# Step 2: Extract tables from DOCX and save JSON
|
494 |
print("Extracting XML data to JSON...")
|
495 |
-
|
|
|
496 |
|
497 |
# Step 2: Process JSON with OpenAI to get structured output
|
498 |
print("Processing JSON data with AI...")
|
499 |
-
|
|
|
500 |
|
501 |
# Step 3: Save formatted data as Excel
|
502 |
print("Converting AI Generated JSON to Excel...")
|
@@ -541,4 +577,4 @@ interface = gr.Interface(
|
|
541 |
theme=Base()
|
542 |
)
|
543 |
|
544 |
-
interface.launch()
|
|
|
312 |
continue # Skip paragraphs inside tables
|
313 |
|
314 |
texts = [t.text.strip() for t in paragraph.findall('.//w:t', NS) if t.text]
|
315 |
+
line = clean_spaces(' '.join(texts).replace(':',':')) # Clean colons and spaces
|
316 |
|
317 |
if ':' in line:
|
318 |
extracted_text.append(line)
|
|
|
334 |
f.write(xml_content)
|
335 |
return xml_content
|
336 |
|
337 |
+
|
338 |
def xml_to_json(xml_content, save_json=False, json_filename="extracted_data.json"):
|
339 |
|
340 |
tree = ET.ElementTree(ET.fromstring(xml_content))
|
|
|
350 |
|
351 |
return json.dumps(extracted_data, ensure_ascii=False, indent=4)
|
352 |
|
353 |
+
|
354 |
+
def deepseek_extract_contract_summary(json_data, save_json=False, json_filename="contract_summary.json"):
|
355 |
"""Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
|
356 |
|
357 |
# Step 1: Convert JSON string to Python dictionary
|
|
|
361 |
filtered_contract_data = {key: value for key, value in contract_data.items() if "long_table" not in key}
|
362 |
|
363 |
# Step 3: Convert back to JSON string (if needed)
|
364 |
+
json_output = json.dumps(contract_data, ensure_ascii=False, indent=4)
|
365 |
|
366 |
prompt = """You are given a contract in JSON format. Extract the following information:
|
367 |
|
368 |
# Response Format
|
369 |
+
Return the extracted information as a structured JSON in the exact format shown below (Note: Do not repeat any keys, if unsure leave the value empty):
|
370 |
|
371 |
{
|
372 |
"合同编号":
|
373 |
+
"采购经办人": (注意:不是买家必须是采购经办人,不是一个公司而是一个人)
|
374 |
+
"接收人": (注意:不是买家必须是接收人,不是一个公司而是一个人)
|
375 |
"Recipient":
|
376 |
+
"接收地": (注意:不是交货地点是目的港)
|
377 |
+
"Place of receipt": (如果接收地有英文填在这里)
|
378 |
"供应商":
|
379 |
+
"币种": (主要用的货币,填英文缩写。GNF一般是为了方便而转换出来的, 除非只有GNF,GNF一般不是主要币种。)
|
380 |
+
"合同日期":
|
381 |
+
"供货日期": 必须是一个日期,而不是天数
|
382 |
}
|
383 |
|
384 |
Contract data in JSON format:""" + f"""
|
|
|
391 |
}
|
392 |
]
|
393 |
|
394 |
+
# Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
|
395 |
client = OpenAI(
|
396 |
base_url="https://router.huggingface.co/novita",
|
397 |
api_key=HF_API_KEY,
|
|
|
400 |
completion = client.chat.completions.create(
|
401 |
model="deepseek/deepseek-r1-distill-qwen-14b",
|
402 |
messages=messages,
|
403 |
+
temperature=0.5,
|
404 |
)
|
405 |
|
406 |
+
# Deepseek V3 --------------------------------
|
407 |
+
# client = OpenAI(
|
408 |
+
# base_url="https://router.huggingface.co/novita",
|
409 |
+
# api_key=HF_API_KEY,
|
410 |
+
# )
|
411 |
+
|
412 |
+
# completion = client.chat.completions.create(
|
413 |
+
# model="deepseek/deepseek_v3",
|
414 |
+
# messages=messages,
|
415 |
+
# temperature=0.1,
|
416 |
+
# )
|
417 |
+
|
418 |
+
# Qwen 2.5 7B --------------------------------
|
419 |
+
# client = OpenAI(
|
420 |
+
# base_url="https://router.huggingface.co/together",
|
421 |
+
# api_key=HF_API_KEY,
|
422 |
+
# )
|
423 |
+
|
424 |
+
# completion = client.chat.completions.create(
|
425 |
+
# model="Qwen/Qwen2.5-7B-Instruct-Turbo",
|
426 |
+
# messages=messages,
|
427 |
+
# )
|
428 |
+
|
429 |
+
think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
|
430 |
+
if think_text:
|
431 |
+
print(f"Thought Process: {think_text}")
|
432 |
+
logging.info(f"Think text: {think_text}")
|
433 |
+
|
434 |
contract_summary = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL) # Remove think
|
435 |
|
436 |
contract_summary = re.sub(r"^```json\n|```$", "", contract_summary, flags=re.DOTALL) # Remove ```
|
437 |
|
438 |
if save_json:
|
439 |
+
with open(json_filename, "w", encoding="utf-8") as f:
|
440 |
f.write(contract_summary)
|
441 |
|
442 |
return json.dumps(contract_summary, ensure_ascii=False, indent=4)
|
443 |
|
444 |
+
|
445 |
def deepseek_extract_price_list(json_data):
|
446 |
"""Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
|
447 |
|
|
|
519 |
|
520 |
# Step 1: Extract XML content from DOCX
|
521 |
print("Extracting Docs data to XML...")
|
522 |
+
xml_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_document.xml"
|
523 |
+
xml_file = extract_docx_as_xml(docx_bytes, save_xml=True, xml_filename=xml_filename)
|
524 |
|
525 |
get_namespace(ET.fromstring(xml_file))
|
526 |
|
527 |
# Step 2: Extract tables from DOCX and save JSON
|
528 |
print("Extracting XML data to JSON...")
|
529 |
+
json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
|
530 |
+
extracted_data = xml_to_json(xml_file, save_json=True, json_filename=json_filename)
|
531 |
|
532 |
# Step 2: Process JSON with OpenAI to get structured output
|
533 |
print("Processing JSON data with AI...")
|
534 |
+
contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
|
535 |
+
contract_summary = deepseek_extract_contract_summary(extracted_data, save_json=True, json_filename=contract_summary_filename)
|
536 |
|
537 |
# Step 3: Save formatted data as Excel
|
538 |
print("Converting AI Generated JSON to Excel...")
|
|
|
577 |
theme=Base()
|
578 |
)
|
579 |
|
580 |
+
interface.launch()
|