MikeMai committed
Commit 20e903b · verified · 1 Parent(s): 4ad44cc

Update app.py

Files changed (1)
  1. app.py +152 -76
app.py CHANGED
@@ -16,6 +16,9 @@ import re
 
 import logging
 
+from pydantic import BaseModel, Field, ValidationError, RootModel
+from typing import List, Optional
+
 
 HF_API_KEY = os.getenv("HF_API_KEY")
 
@@ -247,7 +250,28 @@ def process_long_table(rows):
 
         table_data.append(row_data)
 
-    return table_data
+    # Filter out rows where the "序号" column contains non-numeric values
+    filtered_table_data = []
+    for row in table_data:
+        # Check potential serial number columns (use both Chinese and English variants)
+        serial_number = None
+        for column in row:
+            if any(term in column for term in ["序号"]):
+                serial_number = row[column]
+                break
+
+        # If we found a serial number column, check if its value is numeric
+        if serial_number is not None:
+            # Strip any non-numeric characters and check if there's still a value
+            # This keeps values like "1", "2." etc. but filters out "No." or other text
+            cleaned_number = re.sub(r'[^\d]', '', serial_number)
+            if cleaned_number:  # If there are any digits left, keep the row
+                filtered_table_data.append(row)
+        else:
+            # If we couldn't find a serial number column, keep the row
+            filtered_table_data.append(row)
+
+    return filtered_table_data
 
 def extract_tables(root):
     """Extracts tables from the DOCX document and returns structured data."""
@@ -402,29 +426,6 @@ Contract data in JSON format:""" + f"""
         temperature=0.5,
     )
 
-    # Deepseek V3 --------------------------------
-    # client = OpenAI(
-    #     base_url="https://router.huggingface.co/novita",
-    #     api_key=HF_API_KEY,
-    # )
-
-    # completion = client.chat.completions.create(
-    #     model="deepseek/deepseek_v3",
-    #     messages=messages,
-    #     temperature=0.1,
-    # )
-
-    # Qwen 2.5 7B --------------------------------
-    # client = OpenAI(
-    #     base_url="https://router.huggingface.co/together",
-    #     api_key=HF_API_KEY,
-    # )
-
-    # completion = client.chat.completions.create(
-    #     model="Qwen/Qwen2.5-7B-Instruct-Turbo",
-    #     messages=messages,
-    # )
-
     think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
     if think_text:
         print(f"Thought Process: {think_text}")
@@ -441,50 +442,110 @@ Contract data in JSON format:""" + f"""
     return json.dumps(contract_summary, ensure_ascii=False, indent=4)
 
 
-def deepseek_extract_price_list(json_data):
-    """Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
-
-    # Step 1: Convert JSON string to Python dictionary
-    contract_data = json.loads(json_data)
+def deepseek_extract_price_list(price_list, save_json=False, json_name="price_list.json"):
+    """
+    Extracts structured price list using DeepSeek LLM and validates output with Pydantic.
+    Retries up to 3 times with error feedback if output is not valid JSON.
+    """
 
-    # Step 2: Remove keys that contain "long_table"
-    filtered_contract_data = {key: value for key, value in contract_data.items() if "long_table" in key}
+    # Pydantic schema
+    class PriceItem(BaseModel):
+        序号: str
+        名称: str
+        名称_英文: str = Field(..., alias="名称(英文)")
+        品牌: str
+        规格: str
+        所属机型: str
+        采购数量: str
+        单位: str
+        单价: str
+        总价: str
+        几郎单价: str
+        几郎总额: str
+        备注: str
+        计划来源: str
+        其他: dict = Field(default_factory=dict, alias="其他")
+
+    class PriceListModel(BaseModel):
+        items: List[PriceItem]
+
+    base_prompt = f"""你会接收到一个采购清单列表,请你提取以下字段并重新输出为一个结构化的 JSON 格式。
+    有时候第一行是表头,有时候是数据行,只输入数据行。请注意,输出的 JSON 需要符合以下格式要求:
+
+    # 输出格式要求:
+    每个条目输出以下字段:
+    - 序号
+    - 名称:只填中文
+    - 名称(英文):只填英文
+    - 品牌
+    - 规格
+    - 所属机型
+    - 采购数量
+    - 单位
+    - 单价: 只填数字
+    - 总价: 只填数字
+    - 几郎单价: 只填数字
+    - 几郎总额: 只填数字
+    - 备注
+    - 计划来源
+    - 其他:如果有以上以外的字段就以list的形式写在其他里 ("其他": "key1": "value1", "key2":"value2"),如果没有就给一个空的list
+
+    请确保输出的 JSON 是有效的,且字段名称与输入的字段名称一致。请注意,字段名称可能会有不同的拼写方式,请根据上下文进行判断。
+    请确保输出的条目数量与输入的列表数量一致。
+
+    # 原始价格表:
+    {price_list}"""
+
+    messages = [{"role": "user", "content": base_prompt}]
 
-    # Step 3: Convert back to JSON string (if needed)
-    json_output = json.dumps(filtered_contract_data, ensure_ascii=False, indent=4)
-
-    prompt = """You are given a price list in JSON format. Extract the following information in CSV format:
+    client = OpenAI(
+        base_url="https://router.huggingface.co/novita",
+        api_key=HF_API_KEY,
+    )
 
-    # Response Format
-    Return the extracted information as a CSV in the exact format shown below:
+    for attempt in range(3):
+        print(f"🔁 Attempt {attempt + 1} to extract and validate Price List")
 
-    物料名称, 物料名称(英文), 物料规格, 采购数量, 单位, 单价, 计划号
+        try:
+            response = client.chat.completions.create(
+                model="deepseek/deepseek-r1-distill-qwen-14b",
+                messages=messages,
+            )
+            raw = response.choices[0].message.content
 
-    JSON data:""" + f"""
-    {json_output}"""
+            # Strip out LLM artifacts
+            raw = re.sub(r"<think>.*?</think>\s*", "", raw, flags=re.DOTALL)
+            raw = re.sub(r"^```json\n|```$", "", raw.strip(), flags=re.DOTALL)
 
-    messages = [
-        {
+            # Wrap the raw JSON in a proper structure if it's a list
+            if raw.strip().startswith('['):
+                raw = '{"items": ' + raw + '}'
+
+            validated = PriceListModel.model_validate_json(raw)
+            price_list_json = validated.model_dump(by_alias=True)["items"]
+
+            if save_json:
+                with open(json_name, "w", encoding="utf-8") as f:
+                    json.dump(price_list_json, f, ensure_ascii=False, indent=4)
+                print(f"✅ Saved to {json_name}")
+
+            return price_list_json
+
+        except ValidationError as ve:
+            error_msg = f"Pydantic validation error: {ve}"
+        except Exception as e:
+            error_msg = f"Unexpected error: {e}"
+
+        print(f"❌ {error_msg}")
+        messages.append({
            "role": "user",
-            "content": prompt
-        }
-    ]
+            "content": f"Your previous attempt gave this error: {error_msg}. Please try again ensuring your response is valid JSON with correct format."
+        })
 
-    client = OpenAI(
-        base_url="https://router.huggingface.co/novita",
-        api_key=HF_API_KEY,
-    )
-
-    completion = client.chat.completions.create(
-        model="deepseek/deepseek-r1-distill-qwen-14b",
-        messages=messages,
-    )
-
-    price_list = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL)
+    print("⚠️ Failed after 3 attempts.")
+    return raw
 
-    price_list = re.sub(r"^```json\n|```$", "", price_list, flags=re.DOTALL)
 
-
 def json_to_excel(contract_summary, json_data, excel_path):
     """Converts extracted JSON tables to an Excel file."""
 
@@ -507,7 +568,7 @@ def json_to_excel(contract_summary, json_data, excel_path):
 #--- Extract PO ------------------------------
 
 def extract_po(docx_path):
-    """Processes a single .docx file, extracts tables, formats with OpenAI, and saves as an Excel file."""
+    """Processes a single .docx file, extracts tables, formats with OpenAI, and returns combined JSON data."""
     if not os.path.exists(docx_path) or not docx_path.endswith(".docx"):
        raise ValueError(f"Invalid file: {docx_path}")
 
@@ -518,28 +579,42 @@ def extract_po(docx_path):
     # Step 1: Extract XML content from DOCX
     print("Extracting Docs data to XML...")
     xml_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_document.xml"
-    xml_file = extract_docx_as_xml(docx_bytes, save_xml=True, xml_filename=xml_filename)
+    xml_file = extract_docx_as_xml(docx_bytes, save_xml=False, xml_filename=xml_filename)
 
     get_namespace(ET.fromstring(xml_file))
 
     # Step 2: Extract tables from DOCX and save JSON
     print("Extracting XML data to JSON...")
     json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
-    extracted_data = xml_to_json(xml_file, save_json=True, json_filename=json_filename)
+    extracted_data = xml_to_json(xml_file, save_json=False, json_filename=json_filename)
 
-    # Step 2: Process JSON with OpenAI to get structured output
-    print("Processing JSON data with AI...")
+    # Step 3: Process JSON with OpenAI to get structured output
+    print("Processing Contract Summary data with AI...")
     contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
-    contract_summary = deepseek_extract_contract_summary(extracted_data, save_json=True, json_filename=contract_summary_filename)
-
-    # Step 3: Save formatted data as Excel
-    print("Converting AI Generated JSON to Excel...")
-    excel_output_path = os.path.splitext(docx_path)[0] + ".xlsx"
-    json_to_excel(contract_summary, extracted_data, excel_output_path)
-
-    print(f"Excel file saved at: {excel_output_path}")
+    contract_summary = deepseek_extract_contract_summary(extracted_data, save_json=False, json_filename=contract_summary_filename)
 
+    # Find the last long table (excluding summary tables)
+    print("Processing Price List data with AI...")
+    long_tables = [
+        table for key, table in json.loads(extracted_data).items()
+        if "long_table" in key and "summary" not in key
+    ]
+    last_long_table = long_tables[-1] if long_tables else {}
+
+    # Generate the price list filename in the same folder as the document
+    price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
+
+    # Process the price list and save it to a JSON file
+    price_list = deepseek_extract_price_list(last_long_table, save_json=True, json_name=price_list_filename)
+
+    # Step 4: Combine contract summary and long table data into a single JSON object
+    print("Combining AI Generated JSON with Extracted Data...")
 
+    combined_data = {
+        "contract_summary": json.loads(json.loads(contract_summary)),
+        "price_list": price_list
+    }
+
     # Logging
     log = f"""Results:
 
@@ -547,20 +622,20 @@ def extract_po(docx_path):
 
     RAW Extracted Data: {extracted_data},
 
-    XML Preview: {xml_file[:1000]}"""
+    Combined JSON: {json.dumps(combined_data, ensure_ascii=False, indent=4)}"""
 
     print(log)
-
     logging.info(f"""{log}""")
 
-
-    return excel_output_path
+    return combined_data
 
 # Example Usage
 
 # extract_po("test-contract-converted.docx")
 # extract_po("test-contract.docx")
 
+# print(deepseek_extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
+
 # Gradio Interface ------------------------------
 
 import gradio as gr
@@ -570,9 +645,10 @@ interface = gr.Interface(
     fn=extract_po,
     title="PO Extractor 买卖合同数据提取",
     inputs=gr.File(label="买卖合同 (.docx)"),
-    outputs=gr.File(label="数据提取结果 (.xlsx)"),
+    outputs=gr.Json(label="提取结果"),
     flagging_mode="never",
     theme=Base()
 )
 
 interface.launch()
+
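
For reference, a minimal standalone sketch of the validate-and-retry pattern this commit introduces in deepseek_extract_price_list, assuming pydantic v2; fake_llm is a hypothetical stub standing in for the DeepSeek call, and Item/ItemList are illustrative stand-ins for PriceItem/PriceListModel.

import json
from typing import List

from pydantic import BaseModel, Field, ValidationError


class Item(BaseModel):
    序号: str
    名称_英文: str = Field(..., alias="名称(英文)")


class ItemList(BaseModel):
    items: List[Item]


def fake_llm(messages):
    # Hypothetical stub: the real code calls DeepSeek via the HF router here.
    return json.dumps([{"序号": "1", "名称(英文)": "PE corrugated pipe"}], ensure_ascii=False)


def extract_with_retry(messages, attempts=3):
    raw = ""
    for _ in range(attempts):
        raw = fake_llm(messages).strip()
        if raw.startswith("["):
            # The model returns a bare JSON array; wrap it to match the schema.
            raw = '{"items": ' + raw + "}"
        try:
            validated = ItemList.model_validate_json(raw)
            return validated.model_dump(by_alias=True)["items"]
        except ValidationError as ve:
            # Feed the validation error back so the next attempt can self-correct.
            messages.append({"role": "user", "content": f"Previous output was invalid: {ve}"})
    return raw  # give up after the last attempt, as the commit does


print(extract_with_retry([{"role": "user", "content": "..."}]))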