MikeMai committed on
Commit
5be131b
·
verified ·
1 Parent(s): 1cc3a10

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -16
app.py CHANGED
@@ -312,7 +312,7 @@ def extract_text_outside_tables(root, table_paragraphs):
312
  continue # Skip paragraphs inside tables
313
 
314
  texts = [t.text.strip() for t in paragraph.findall('.//w:t', NS) if t.text]
315
- line = clean_spaces(' '.join(texts).replace(';', '').replace(';','').replace(':',':')) # Remove semicolons and clean spaces
316
 
317
  if ':' in line:
318
  extracted_text.append(line)
@@ -334,6 +334,7 @@ def extract_docx_as_xml(file_bytes, save_xml=False, xml_filename="document.xml")
334
  f.write(xml_content)
335
  return xml_content
336
 
 
337
  def xml_to_json(xml_content, save_json=False, json_filename="extracted_data.json"):
338
 
339
  tree = ET.ElementTree(ET.fromstring(xml_content))
@@ -349,7 +350,8 @@ def xml_to_json(xml_content, save_json=False, json_filename="extracted_data.json
349
 
350
  return json.dumps(extracted_data, ensure_ascii=False, indent=4)
351
 
352
- def deepseek_extract_contract_summary(json_data, save_json=False):
 
353
  """Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
354
 
355
  # Step 1: Convert JSON string to Python dictionary
@@ -359,24 +361,24 @@ def deepseek_extract_contract_summary(json_data, save_json=False):
359
  filtered_contract_data = {key: value for key, value in contract_data.items() if "long_table" not in key}
360
 
361
  # Step 3: Convert back to JSON string (if needed)
362
- json_output = json.dumps(filtered_contract_data, ensure_ascii=False, indent=4)
363
 
364
  prompt = """You are given a contract in JSON format. Extract the following information:
365
 
366
  # Response Format
367
- Return the extracted information as a structured JSON in the exact format shown below (Do not repeat any keys):
368
 
369
  {
370
  "合同编号":
371
- "采购经办人":
372
- "接收人":
373
  "Recipient":
374
- "接收地":
375
- "Place of receipt":
376
  "供应商":
377
- "币种":
378
- "合同日期":
379
- "供货日期":
380
  }
381
 
382
  Contract data in JSON format:""" + f"""
@@ -389,6 +391,7 @@ Contract data in JSON format:""" + f"""
389
  }
390
  ]
391
 
 
392
  client = OpenAI(
393
  base_url="https://router.huggingface.co/novita",
394
  api_key=HF_API_KEY,
@@ -397,18 +400,48 @@ Contract data in JSON format:""" + f"""
397
  completion = client.chat.completions.create(
398
  model="deepseek/deepseek-r1-distill-qwen-14b",
399
  messages=messages,
 
400
  )
401
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  contract_summary = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL) # Remove think
403
 
404
  contract_summary = re.sub(r"^```json\n|```$", "", contract_summary, flags=re.DOTALL) # Remove ```
405
 
406
  if save_json:
407
- with open("extracted_contract_summary.json", "w", encoding="utf-8") as f:
408
  f.write(contract_summary)
409
 
410
  return json.dumps(contract_summary, ensure_ascii=False, indent=4)
411
 
 
412
  def deepseek_extract_price_list(json_data):
413
  """Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
414
 
@@ -486,17 +519,20 @@ def extract_po(docx_path):
486
 
487
  # Step 1: Extract XML content from DOCX
488
  print("Extracting Docs data to XML...")
489
- xml_file = extract_docx_as_xml(docx_bytes,save_xml=True)
 
490
 
491
  get_namespace(ET.fromstring(xml_file))
492
 
493
  # Step 2: Extract tables from DOCX and save JSON
494
  print("Extracting XML data to JSON...")
495
- extracted_data = xml_to_json(xml_file, save_json=True)
 
496
 
497
  # Step 2: Process JSON with OpenAI to get structured output
498
  print("Processing JSON data with AI...")
499
- contract_summary = deepseek_extract_contract_summary(extracted_data, save_json=True)
 
500
 
501
  # Step 3: Save formatted data as Excel
502
  print("Converting AI Generated JSON to Excel...")
@@ -541,4 +577,4 @@ interface = gr.Interface(
541
  theme=Base()
542
  )
543
 
544
- interface.launch()
 
312
  continue # Skip paragraphs inside tables
313
 
314
  texts = [t.text.strip() for t in paragraph.findall('.//w:t', NS) if t.text]
315
+ line = clean_spaces(' '.join(texts).replace(':',':')) # Clean colons and spaces
316
 
317
  if ':' in line:
318
  extracted_text.append(line)
 
334
  f.write(xml_content)
335
  return xml_content
336
 
337
+
338
  def xml_to_json(xml_content, save_json=False, json_filename="extracted_data.json"):
339
 
340
  tree = ET.ElementTree(ET.fromstring(xml_content))
 
350
 
351
  return json.dumps(extracted_data, ensure_ascii=False, indent=4)
352
 
353
+
354
+ def deepseek_extract_contract_summary(json_data, save_json=False, json_filename="contract_summary.json"):
355
  """Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
356
 
357
  # Step 1: Convert JSON string to Python dictionary
 
361
  filtered_contract_data = {key: value for key, value in contract_data.items() if "long_table" not in key}
362
 
363
  # Step 3: Convert back to JSON string (if needed)
364
+ json_output = json.dumps(contract_data, ensure_ascii=False, indent=4)
365
 
366
  prompt = """You are given a contract in JSON format. Extract the following information:
367
 
368
  # Response Format
369
+ Return the extracted information as a structured JSON in the exact format shown below (Note: Do not repeat any keys, if unsure leave the value empty):
370
 
371
  {
372
  "合同编号":
373
+ "采购经办人": (注意:不是买家必须是采购经办人,不是一个公司而是一个人)
374
+ "接收人": (注意:不是买家必须是接收人,不是一个公司而是一个人)
375
  "Recipient":
376
+ "接收地": (注意:不是交货地点是目的港)
377
+ "Place of receipt": (如果接收地有英文填在这里)
378
  "供应商":
379
+ "币种": (主要用的货币,填英文缩写。GNF一般是为了方便而转换出来的, 除非只有GNF,GNF一般不是主要币种。)
380
+ "合同日期":
381
+ "供货日期": 必须是一个日期,而不是天数
382
  }
383
 
384
  Contract data in JSON format:""" + f"""
 
391
  }
392
  ]
393
 
394
+ # Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
395
  client = OpenAI(
396
  base_url="https://router.huggingface.co/novita",
397
  api_key=HF_API_KEY,
 
400
  completion = client.chat.completions.create(
401
  model="deepseek/deepseek-r1-distill-qwen-14b",
402
  messages=messages,
403
+ temperature=0.5,
404
  )
405
 
406
+ # Deepseek V3 --------------------------------
407
+ # client = OpenAI(
408
+ # base_url="https://router.huggingface.co/novita",
409
+ # api_key=HF_API_KEY,
410
+ # )
411
+
412
+ # completion = client.chat.completions.create(
413
+ # model="deepseek/deepseek_v3",
414
+ # messages=messages,
415
+ # temperature=0.1,
416
+ # )
417
+
418
+ # Qwen 2.5 7B --------------------------------
419
+ # client = OpenAI(
420
+ # base_url="https://router.huggingface.co/together",
421
+ # api_key=HF_API_KEY,
422
+ # )
423
+
424
+ # completion = client.chat.completions.create(
425
+ # model="Qwen/Qwen2.5-7B-Instruct-Turbo",
426
+ # messages=messages,
427
+ # )
428
+
429
+ think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
430
+ if think_text:
431
+ print(f"Thought Process: {think_text}")
432
+ logging.info(f"Think text: {think_text}")
433
+
434
  contract_summary = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL) # Remove think
435
 
436
  contract_summary = re.sub(r"^```json\n|```$", "", contract_summary, flags=re.DOTALL) # Remove ```
437
 
438
  if save_json:
439
+ with open(json_filename, "w", encoding="utf-8") as f:
440
  f.write(contract_summary)
441
 
442
  return json.dumps(contract_summary, ensure_ascii=False, indent=4)
443
 
444
+
445
  def deepseek_extract_price_list(json_data):
446
  """Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
447
 
 
519
 
520
  # Step 1: Extract XML content from DOCX
521
  print("Extracting Docs data to XML...")
522
+ xml_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_document.xml"
523
+ xml_file = extract_docx_as_xml(docx_bytes, save_xml=True, xml_filename=xml_filename)
524
 
525
  get_namespace(ET.fromstring(xml_file))
526
 
527
  # Step 2: Extract tables from DOCX and save JSON
528
  print("Extracting XML data to JSON...")
529
+ json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
530
+ extracted_data = xml_to_json(xml_file, save_json=True, json_filename=json_filename)
531
 
532
  # Step 2: Process JSON with OpenAI to get structured output
533
  print("Processing JSON data with AI...")
534
+ contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
535
+ contract_summary = deepseek_extract_contract_summary(extracted_data, save_json=True, json_filename=contract_summary_filename)
536
 
537
  # Step 3: Save formatted data as Excel
538
  print("Converting AI Generated JSON to Excel...")
 
577
  theme=Base()
578
  )
579
 
580
+ interface.launch()