MikeMai commited on
Commit
3c93df8
·
verified ·
1 Parent(s): 20e903b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -152
app.py CHANGED
@@ -16,9 +16,6 @@ import re
16
 
17
  import logging
18
 
19
- from pydantic import BaseModel, Field, ValidationError, RootModel
20
- from typing import List, Optional
21
-
22
 
23
  HF_API_KEY = os.getenv("HF_API_KEY")
24
 
@@ -250,28 +247,7 @@ def process_long_table(rows):
250
 
251
  table_data.append(row_data)
252
 
253
- # Filter out rows where the "序号" column contains non-numeric values
254
- filtered_table_data = []
255
- for row in table_data:
256
- # Check potential serial number columns (use both Chinese and English variants)
257
- serial_number = None
258
- for column in row:
259
- if any(term in column for term in ["序号"]):
260
- serial_number = row[column]
261
- break
262
-
263
- # If we found a serial number column, check if its value is numeric
264
- if serial_number is not None:
265
- # Strip any non-numeric characters and check if there's still a value
266
- # This keeps values like "1", "2." etc. but filters out "No." or other text
267
- cleaned_number = re.sub(r'[^\d]', '', serial_number)
268
- if cleaned_number: # If there are any digits left, keep the row
269
- filtered_table_data.append(row)
270
- else:
271
- # If we couldn't find a serial number column, keep the row
272
- filtered_table_data.append(row)
273
-
274
- return filtered_table_data
275
 
276
  def extract_tables(root):
277
  """Extracts tables from the DOCX document and returns structured data."""
@@ -426,6 +402,29 @@ Contract data in JSON format:""" + f"""
426
  temperature=0.5,
427
  )
428
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
429
  think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
430
  if think_text:
431
  print(f"Thought Process: {think_text}")
@@ -442,110 +441,50 @@ Contract data in JSON format:""" + f"""
442
  return json.dumps(contract_summary, ensure_ascii=False, indent=4)
443
 
444
 
445
- def deepseek_extract_price_list(price_list, save_json=False, json_name="price_list.json"):
446
- """
447
- Extracts structured price list using DeepSeek LLM and validates output with Pydantic.
448
- Retries up to 3 times with error feedback if output is not valid JSON.
449
- """
450
-
451
- # Pydantic schema
452
- class PriceItem(BaseModel):
453
- 序号: str
454
- 名称: str
455
- 名称_英文: str = Field(..., alias="名称(英文)")
456
- 品牌: str
457
- 规格: str
458
- 所属机型: str
459
- 采购数量: str
460
- 单位: str
461
- 单价: str
462
- 总价: str
463
- 几郎单价: str
464
- 几郎总额: str
465
- 备注: str
466
- 计划来源: str
467
- 其他: dict = Field(default_factory=dict, alias="其他")
468
-
469
- class PriceListModel(BaseModel):
470
- items: List[PriceItem]
471
-
472
- base_prompt = f"""你会接收到一个采购清单列表,请你提取以下字段并重新输出为一个结构化的 JSON 格式。
473
- 有时候第一行是表头,有时候是数据行,只输入数据行。请注意,输出的 JSON 需要符合以下格式要求:
474
-
475
- # 输出格式要求:
476
- 每个条目输出以下字段:
477
- - 序号
478
- - 名称:只填中文
479
- - 名称(英文):只填英文
480
- - 品牌
481
- - 规格
482
- - 所属机型
483
- - 采购数量
484
- - 单位
485
- - 单价: 只填数字
486
- - 总价: 只填数字
487
- - 几郎单价: 只填数字
488
- - 几郎总额: 只填数字
489
- - 备注
490
- - 计划来源
491
- - 其他:如果有以上以外的字段就以list的形式写在其他里 ("其他": "key1": "value1", "key2":"value2"),如果没有就给一个空的list
492
-
493
- 请确保输出的 JSON 是有效的,且字段名称与输入的字段名称一致。请注意,字段名称可能会有不同的拼写方式,请根据上下文进行判断。
494
- 请确保输出的条目数量与输入的列表数量一致。
495
-
496
- # 原始价格表:
497
- {price_list}"""
498
-
499
- messages = [{"role": "user", "content": base_prompt}]
500
-
501
- client = OpenAI(
502
- base_url="https://router.huggingface.co/novita",
503
- api_key=HF_API_KEY,
504
- )
505
-
506
- for attempt in range(3):
507
- print(f"🔁 Attempt {attempt + 1} to extract and validate Price List")
508
-
509
- try:
510
- response = client.chat.completions.create(
511
- model="deepseek/deepseek-r1-distill-qwen-14b",
512
- messages=messages,
513
- )
514
- raw = response.choices[0].message.content
515
-
516
- # Strip out LLM artifacts
517
- raw = re.sub(r"<think>.*?</think>\s*", "", raw, flags=re.DOTALL)
518
- raw = re.sub(r"^```json\n|```$", "", raw.strip(), flags=re.DOTALL)
519
 
520
- # Wrap the raw JSON in a proper structure if it's a list
521
- if raw.strip().startswith('['):
522
- raw = '{"items": ' + raw + '}'
523
 
524
- validated = PriceListModel.model_validate_json(raw)
525
- price_list_json = validated.model_dump(by_alias=True)["items"]
 
 
526
 
527
- if save_json:
528
- with open(json_name, "w", encoding="utf-8") as f:
529
- json.dump(price_list_json, f, ensure_ascii=False, indent=4)
530
- print(f"✅ Saved to {json_name}")
531
 
532
- return price_list_json
533
 
534
- except ValidationError as ve:
535
- error_msg = f"Pydantic validation error: {ve}"
536
- except Exception as e:
537
- error_msg = f"Unexpected error: {e}"
538
 
539
- print(f"❌ {error_msg}")
540
- messages.append({
541
  "role": "user",
542
- "content": f"Your previous attempt gave this error: {error_msg}. Please try again ensuring your response is valid JSON with correct format."
543
- })
 
544
 
545
- print("⚠️ Failed after 3 attempts.")
546
- return raw
 
 
 
 
 
 
 
 
 
547
 
 
548
 
 
549
  def json_to_excel(contract_summary, json_data, excel_path):
550
  """Converts extracted JSON tables to an Excel file."""
551
 
@@ -568,7 +507,7 @@ def json_to_excel(contract_summary, json_data, excel_path):
568
  #--- Extract PO ------------------------------
569
 
570
  def extract_po(docx_path):
571
- """Processes a single .docx file, extracts tables, formats with OpenAI, and returns combined JSON data."""
572
  if not os.path.exists(docx_path) or not docx_path.endswith(".docx"):
573
  raise ValueError(f"Invalid file: {docx_path}")
574
 
@@ -579,42 +518,28 @@ def extract_po(docx_path):
579
  # Step 1: Extract XML content from DOCX
580
  print("Extracting Docs data to XML...")
581
  xml_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_document.xml"
582
- xml_file = extract_docx_as_xml(docx_bytes, save_xml=False, xml_filename=xml_filename)
583
 
584
  get_namespace(ET.fromstring(xml_file))
585
 
586
  # Step 2: Extract tables from DOCX and save JSON
587
  print("Extracting XML data to JSON...")
588
  json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
589
- extracted_data = xml_to_json(xml_file, save_json=False, json_filename=json_filename)
590
 
591
- # Step 3: Process JSON with OpenAI to get structured output
592
- print("Processing Contract Summary data with AI...")
593
  contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
594
- contract_summary = deepseek_extract_contract_summary(extracted_data, save_json=False, json_filename=contract_summary_filename)
595
-
596
- # Find the last long table (excluding summary tables)
597
- print("Processing Price List data with AI...")
598
- long_tables = [
599
- table for key, table in json.loads(extracted_data).items()
600
- if "long_table" in key and "summary" not in key
601
- ]
602
- last_long_table = long_tables[-1] if long_tables else {}
603
-
604
- # Generate the price list filename in the same folder as the document
605
- price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
606
-
607
- # Process the price list and save it to a JSON file
608
- price_list = deepseek_extract_price_list(last_long_table, save_json=True, json_name=price_list_filename)
609
 
610
- # Step 4: Combine contract summary and long table data into a single JSON object
611
- print("Combining AI Generated JSON with Extracted Data...")
612
-
613
- combined_data = {
614
- "contract_summary": json.loads(json.loads(contract_summary)),
615
- "price_list": price_list
616
- }
617
 
 
 
 
618
  # Logging
619
  log = f"""Results:
620
 
@@ -622,20 +547,20 @@ def extract_po(docx_path):
622
 
623
  RAW Extracted Data: {extracted_data},
624
 
625
- Combined JSON: {json.dumps(combined_data, ensure_ascii=False, indent=4)}"""
626
 
627
  print(log)
 
628
  logging.info(f"""{log}""")
629
 
630
- return combined_data
 
631
 
632
  # Example Usage
633
 
634
  # extract_po("test-contract-converted.docx")
635
  # extract_po("test-contract.docx")
636
 
637
- # print(deepseek_extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
638
-
639
  # Gradio Interface ------------------------------
640
 
641
  import gradio as gr
@@ -645,10 +570,9 @@ interface = gr.Interface(
645
  fn=extract_po,
646
  title="PO Extractor 买卖合同数据提取",
647
  inputs=gr.File(label="买卖合同 (.docx)"),
648
- outputs=gr.Json(label="提取结果"),
649
  flagging_mode="never",
650
  theme=Base()
651
  )
652
 
653
  interface.launch()
654
-
 
16
 
17
  import logging
18
 
 
 
 
19
 
20
  HF_API_KEY = os.getenv("HF_API_KEY")
21
 
 
247
 
248
  table_data.append(row_data)
249
 
250
+ return table_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
  def extract_tables(root):
253
  """Extracts tables from the DOCX document and returns structured data."""
 
402
  temperature=0.5,
403
  )
404
 
405
+ # Deepseek V3 --------------------------------
406
+ # client = OpenAI(
407
+ # base_url="https://router.huggingface.co/novita",
408
+ # api_key=HF_API_KEY,
409
+ # )
410
+
411
+ # completion = client.chat.completions.create(
412
+ # model="deepseek/deepseek_v3",
413
+ # messages=messages,
414
+ # temperature=0.1,
415
+ # )
416
+
417
+ # Qwen 2.5 7B --------------------------------
418
+ # client = OpenAI(
419
+ # base_url="https://router.huggingface.co/together",
420
+ # api_key=HF_API_KEY,
421
+ # )
422
+
423
+ # completion = client.chat.completions.create(
424
+ # model="Qwen/Qwen2.5-7B-Instruct-Turbo",
425
+ # messages=messages,
426
+ # )
427
+
428
  think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
429
  if think_text:
430
  print(f"Thought Process: {think_text}")
 
441
  return json.dumps(contract_summary, ensure_ascii=False, indent=4)
442
 
443
 
444
def deepseek_extract_price_list(json_data):
    """Extract a structured price list from previously extracted DOCX table data.

    Keeps only the "long_table" entries of *json_data* (these hold the
    price-list rows), sends them to the DeepSeek R1 distill model via the
    Hugging Face router, and returns the model's answer as CSV text.

    Parameters:
        json_data (str): JSON string mapping table names to row data, as
            produced by the extraction step.

    Returns:
        str: CSV text produced by the model, with ``<think>`` blocks and
        Markdown code fences stripped.

    NOTE(review): requires the module-level ``HF_API_KEY`` and makes a
    network call — not usable offline.
    """

    # Step 1: Convert JSON string to Python dictionary
    contract_data = json.loads(json_data)

    # Step 2: Keep only the keys that contain "long_table" — these hold the
    # price-list rows; all other tables are irrelevant here.
    # (The original comment said "Remove", but the comprehension keeps them.)
    filtered_contract_data = {key: value for key, value in contract_data.items() if "long_table" in key}

    # Step 3: Convert back to JSON string for embedding in the prompt
    json_output = json.dumps(filtered_contract_data, ensure_ascii=False, indent=4)

    prompt = """You are given a price list in JSON format. Extract the following information in CSV format:

# Response Format
Return the extracted information as a CSV in the exact format shown below:

物料名称, 物料名称(英文), 物料规格, 采购数量, 单位, 单价, 计划号

JSON data:""" + f"""
{json_output}"""

    messages = [
        {
            "role": "user",
            "content": prompt
        }
    ]

    client = OpenAI(
        base_url="https://router.huggingface.co/novita",
        api_key=HF_API_KEY,
    )

    completion = client.chat.completions.create(
        model="deepseek/deepseek-r1-distill-qwen-14b",
        messages=messages,
    )

    # Strip the model's chain-of-thought block and any Markdown code fences.
    price_list = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL)
    price_list = re.sub(r"^```json\n|```$", "", price_list, flags=re.DOTALL)

    # Bug fix: the original version fell off the end here and implicitly
    # returned None, discarding the extracted price list.
    return price_list
488
  def json_to_excel(contract_summary, json_data, excel_path):
489
  """Converts extracted JSON tables to an Excel file."""
490
 
 
507
  #--- Extract PO ------------------------------
508
 
509
  def extract_po(docx_path):
510
+ """Processes a single .docx file, extracts tables, formats with OpenAI, and saves as an Excel file."""
511
  if not os.path.exists(docx_path) or not docx_path.endswith(".docx"):
512
  raise ValueError(f"Invalid file: {docx_path}")
513
 
 
518
  # Step 1: Extract XML content from DOCX
519
  print("Extracting Docs data to XML...")
520
  xml_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_document.xml"
521
+ xml_file = extract_docx_as_xml(docx_bytes, save_xml=True, xml_filename=xml_filename)
522
 
523
  get_namespace(ET.fromstring(xml_file))
524
 
525
  # Step 2: Extract tables from DOCX and save JSON
526
  print("Extracting XML data to JSON...")
527
  json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
528
+ extracted_data = xml_to_json(xml_file, save_json=True, json_filename=json_filename)
529
 
530
+ # Step 3: Process JSON with OpenAI to get structured output
531
+ print("Processing JSON data with AI...")
532
  contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
533
+ contract_summary = deepseek_extract_contract_summary(extracted_data, save_json=True, json_filename=contract_summary_filename)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
534
 
535
+ # Step 4: Save formatted data as Excel
536
+ print("Converting AI Generated JSON to Excel...")
537
+ excel_output_path = os.path.splitext(docx_path)[0] + ".xlsx"
538
+ json_to_excel(contract_summary, extracted_data, excel_output_path)
 
 
 
539
 
540
+ print(f"Excel file saved at: {excel_output_path}")
541
+
542
+
543
  # Logging
544
  log = f"""Results:
545
 
 
547
 
548
  RAW Extracted Data: {extracted_data},
549
 
550
+ XML Preview: {xml_file[:1000]}"""
551
 
552
  print(log)
553
+
554
  logging.info(f"""{log}""")
555
 
556
+
557
+ return excel_output_path
558
 
559
  # Example Usage
560
 
561
  # extract_po("test-contract-converted.docx")
562
  # extract_po("test-contract.docx")
563
 
 
 
564
  # Gradio Interface ------------------------------
565
 
566
  import gradio as gr
 
570
  fn=extract_po,
571
  title="PO Extractor 买卖合同数据提取",
572
  inputs=gr.File(label="买卖合同 (.docx)"),
573
+ outputs=gr.File(label="数据提取结果 (.xlsx)"),
574
  flagging_mode="never",
575
  theme=Base()
576
  )
577
 
578
  interface.launch()