Spaces:

MikeMai
/

PO_Extractor

Sleeping

App Files Community

MikeMai committed on Mar 12

Commit

5be131b

verified ·

1 Parent(s): 1cc3a10

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -16

app.py CHANGED Viewed

@@ -312,7 +312,7 @@ def extract_text_outside_tables(root, table_paragraphs):
             continue  # Skip paragraphs inside tables
         texts = [t.text.strip() for t in paragraph.findall('.//w:t', NS) if t.text]
-        line = clean_spaces(' '.join(texts).replace(';', '').replace('；','').replace('：',':'))  # Remove semicolons and clean spaces
         if ':' in line:
             extracted_text.append(line)
@@ -334,6 +334,7 @@ def extract_docx_as_xml(file_bytes, save_xml=False, xml_filename="document.xml")
                     f.write(xml_content)
             return xml_content
 def xml_to_json(xml_content, save_json=False, json_filename="extracted_data.json"):
     tree = ET.ElementTree(ET.fromstring(xml_content))
@@ -349,7 +350,8 @@ def xml_to_json(xml_content, save_json=False, json_filename="extracted_data.json
     return json.dumps(extracted_data, ensure_ascii=False, indent=4)
-def deepseek_extract_contract_summary(json_data, save_json=False):
     """Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
     # Step 1: Convert JSON string to Python dictionary
@@ -359,24 +361,24 @@ def deepseek_extract_contract_summary(json_data, save_json=False):
     filtered_contract_data = {key: value for key, value in contract_data.items() if "long_table" not in key}
     # Step 3: Convert back to JSON string (if needed)
-    json_output = json.dumps(filtered_contract_data, ensure_ascii=False, indent=4)
     prompt = """You are given a contract in JSON format. Extract the following information:
 # Response Format
-Return the extracted information as a structured JSON in the exact format shown below (Do not repeat any keys):
 {
     "合同编号":
-    "采购经办人":
-    "接收人":
     "Recipient":
-    "接收地":
-    "Place of receipt":
     "供应商":
-    "币种":
-    "合同日期":
-    "供货日期":
 }
 Contract data in JSON format:""" + f"""
@@ -389,6 +391,7 @@ Contract data in JSON format:""" + f"""
         }
     ]
     client = OpenAI(
         base_url="https://router.huggingface.co/novita",
         api_key=HF_API_KEY,
@@ -397,18 +400,48 @@ Contract data in JSON format:""" + f"""
     completion = client.chat.completions.create(
         model="deepseek/deepseek-r1-distill-qwen-14b",
         messages=messages,
     )
     contract_summary = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL) # Remove think
     contract_summary = re.sub(r"^```json\n|```$", "", contract_summary, flags=re.DOTALL) # Remove ```
     if save_json:
-        with open("extracted_contract_summary.json", "w", encoding="utf-8") as f:
             f.write(contract_summary)
     return json.dumps(contract_summary, ensure_ascii=False, indent=4)
 def deepseek_extract_price_list(json_data):
     """Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
@@ -486,17 +519,20 @@ def extract_po(docx_path):
     # Step 1: Extract XML content from DOCX
     print("Extracting Docs data to XML...")
-    xml_file = extract_docx_as_xml(docx_bytes,save_xml=True)
     get_namespace(ET.fromstring(xml_file))
     # Step 2: Extract tables from DOCX and save JSON
     print("Extracting XML data to JSON...")
-    extracted_data = xml_to_json(xml_file, save_json=True)
     # Step 2: Process JSON with OpenAI to get structured output
     print("Processing JSON data with AI...")
-    contract_summary = deepseek_extract_contract_summary(extracted_data, save_json=True)
     # Step 3: Save formatted data as Excel
     print("Converting AI Generated JSON to Excel...")
@@ -541,4 +577,4 @@ interface = gr.Interface(
     theme=Base()
 )
-interface.launch()

             continue  # Skip paragraphs inside tables
         texts = [t.text.strip() for t in paragraph.findall('.//w:t', NS) if t.text]
+        line = clean_spaces(' '.join(texts).replace('：',':'))  # Clean colons and spaces
         if ':' in line:
             extracted_text.append(line)
                     f.write(xml_content)
             return xml_content
 def xml_to_json(xml_content, save_json=False, json_filename="extracted_data.json"):
     tree = ET.ElementTree(ET.fromstring(xml_content))
     return json.dumps(extracted_data, ensure_ascii=False, indent=4)
+def deepseek_extract_contract_summary(json_data, save_json=False, json_filename="contract_summary.json"):
     """Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
     # Step 1: Convert JSON string to Python dictionary
     filtered_contract_data = {key: value for key, value in contract_data.items() if "long_table" not in key}
     # Step 3: Convert back to JSON string (if needed)
+    json_output = json.dumps(contract_data, ensure_ascii=False, indent=4)
     prompt = """You are given a contract in JSON format. Extract the following information:
 # Response Format
+Return the extracted information as a structured JSON in the exact format shown below (Note: Do not repeat any keys, if unsure leave the value empty):
 {
     "合同编号":
+    "采购经办人": （注意：不是买家必须是采购经办人，不是一个公司而是一个人）
+    "接收人": （注意：不是买家必须是接收人，不是一个公司而是一个人）
     "Recipient":
+    "接收地": （注意：不是交货地点是目的港）
+    "Place of receipt": （如果接收地有英文填在这里）
     "供应商":
+    "币种": （主要用的货币，填英文缩写。GNF一般是为了方便而转换出来的, 除非只有GNF，GNF一般不是主要币种。）
+    "合同日期":
+    "供货日期": 必须是一个日期，而不是天数
 }
 Contract data in JSON format:""" + f"""
         }
     ]
+    # Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
     client = OpenAI(
         base_url="https://router.huggingface.co/novita",
         api_key=HF_API_KEY,
     completion = client.chat.completions.create(
         model="deepseek/deepseek-r1-distill-qwen-14b",
         messages=messages,
+        temperature=0.5,
     )
+    # Deepseek V3 --------------------------------
+    # client = OpenAI(
+    #     base_url="https://router.huggingface.co/novita",
+    #     api_key=HF_API_KEY,
+    # )
+    # completion = client.chat.completions.create(
+    #     model="deepseek/deepseek_v3",
+    #     messages=messages,
+    #     temperature=0.1,
+    # )
+    # Qwen 2.5 7B --------------------------------
+    # client = OpenAI(
+    #     base_url="https://router.huggingface.co/together",
+    #     api_key=HF_API_KEY,
+    # )
+    # completion = client.chat.completions.create(
+    #     model="Qwen/Qwen2.5-7B-Instruct-Turbo",
+    #     messages=messages,
+    # )
+    think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
+    if think_text:
+        print(f"Thought Process: {think_text}")
+        logging.info(f"Think text: {think_text}")
     contract_summary = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL) # Remove think
     contract_summary = re.sub(r"^```json\n|```$", "", contract_summary, flags=re.DOTALL) # Remove ```
     if save_json:
+        with open(json_filename, "w", encoding="utf-8") as f:
             f.write(contract_summary)
     return json.dumps(contract_summary, ensure_ascii=False, indent=4)
 def deepseek_extract_price_list(json_data):
     """Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
     # Step 1: Extract XML content from DOCX
     print("Extracting Docs data to XML...")
+    xml_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_document.xml"
+    xml_file = extract_docx_as_xml(docx_bytes, save_xml=True, xml_filename=xml_filename)
     get_namespace(ET.fromstring(xml_file))
     # Step 2: Extract tables from DOCX and save JSON
     print("Extracting XML data to JSON...")
+    json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
+    extracted_data = xml_to_json(xml_file, save_json=True, json_filename=json_filename)
     # Step 2: Process JSON with OpenAI to get structured output
     print("Processing JSON data with AI...")
+    contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
+    contract_summary = deepseek_extract_contract_summary(extracted_data, save_json=True, json_filename=contract_summary_filename)
     # Step 3: Save formatted data as Excel
     print("Converting AI Generated JSON to Excel...")
     theme=Base()
 )
+interface.launch()