MikeMai committed
Commit 20e903b · verified · 1 Parent(s): 4ad44cc

Update app.py

Files changed (1)
  1. app.py +152 -76
app.py CHANGED
@@ -16,6 +16,9 @@ import re
 
 import logging
 
+from pydantic import BaseModel, Field, ValidationError, RootModel
+from typing import List, Optional
+
 
 HF_API_KEY = os.getenv("HF_API_KEY")
 
@@ -247,7 +250,28 @@ def process_long_table(rows):
 
         table_data.append(row_data)
 
-    return table_data
+    # Filter out rows where the "序号" column contains non-numeric values
+    filtered_table_data = []
+    for row in table_data:
+        # Check potential serial number columns (use both Chinese and English variants)
+        serial_number = None
+        for column in row:
+            if any(term in column for term in ["序号"]):
+                serial_number = row[column]
+                break
+
+        # If we found a serial number column, check if its value is numeric
+        if serial_number is not None:
+            # Strip any non-numeric characters and check if there's still a value
+            # This keeps values like "1", "2." etc. but filters out "No." or other text
+            cleaned_number = re.sub(r'[^\d]', '', serial_number)
+            if cleaned_number:  # If there are any digits left, keep the row
+                filtered_table_data.append(row)
+        else:
+            # If we couldn't find a serial number column, keep the row
+            filtered_table_data.append(row)
+
+    return filtered_table_data
 
 def extract_tables(root):
     """Extracts tables from the DOCX document and returns structured data."""
@@ -402,29 +426,6 @@ Contract data in JSON format:""" + f"""
         temperature=0.5,
     )
 
-    # Deepseek V3 --------------------------------
-    # client = OpenAI(
-    #     base_url="https://router.huggingface.co/novita",
-    #     api_key=HF_API_KEY,
-    # )
-
-    # completion = client.chat.completions.create(
-    #     model="deepseek/deepseek_v3",
-    #     messages=messages,
-    #     temperature=0.1,
-    # )
-
-    # Qwen 2.5 7B --------------------------------
-    # client = OpenAI(
-    #     base_url="https://router.huggingface.co/together",
-    #     api_key=HF_API_KEY,
-    # )
-
-    # completion = client.chat.completions.create(
-    #     model="Qwen/Qwen2.5-7B-Instruct-Turbo",
-    #     messages=messages,
-    # )
-
     think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
     if think_text:
         print(f"Thought Process: {think_text}")
@@ -441,50 +442,110 @@ Contract data in JSON format:""" + f"""
     return json.dumps(contract_summary, ensure_ascii=False, indent=4)
 
 
-def deepseek_extract_price_list(json_data):
-    """Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
-
-    # Step 1: Convert JSON string to Python dictionary
-    contract_data = json.loads(json_data)
+def deepseek_extract_price_list(price_list, save_json=False, json_name="price_list.json"):
+    """
+    Extracts structured price list using DeepSeek LLM and validates output with Pydantic.
+    Retries up to 3 times with error feedback if output is not valid JSON.
+    """
 
-    # Step 2: Remove keys that contain "long_table"
-    filtered_contract_data = {key: value for key, value in contract_data.items() if "long_table" in key}
+    # Pydantic schema
+    class PriceItem(BaseModel):
+        序号: str
+        名称: str
+        名称_英文: str = Field(..., alias="名称(英文)")
+        品牌: str
+        规格: str
+        所属机型: str
+        采购数量: str
+        单位: str
+        单价: str
+        总价: str
+        几郎单价: str
+        几郎总额: str
+        备注: str
+        计划来源: str
+        其他: dict = Field(default_factory=dict, alias="其他")
+
+    class PriceListModel(BaseModel):
+        items: List[PriceItem]
+
+    base_prompt = f"""你会接收到一个采购清单列表,请你提取以下字段并重新输出为一个结构化的 JSON 格式。
+    有时候第一行是表头,有时候是数据行,只输入数据行。请注意,输出的 JSON 需要符合以下格式要求:
+
+    # 输出格式要求:
+    每个条目输出以下字段:
+    - 序号
+    - 名称:只填中文
+    - 名称(英文):只填英文
+    - 品牌
+    - 规格
+    - 所属机型
+    - 采购数量
+    - 单位
+    - 单价: 只填数字
+    - 总价: 只填数字
+    - 几郎单价: 只填数字
+    - 几郎总额: 只填数字
+    - 备注
+    - 计划来源
+    - 其他:如果有以上以外的字段就以list的形式写在其他里 ("其他": "key1": "value1", "key2":"value2"),如果没有就给一个空的list
+
+    请确保输出的 JSON 是有效的,且字段名称与输入的字段名称一致。请注意,字段名称可能会有不同的拼写方式,请根据上下文进行判断。
+    请确保输出的条目数量与输入的列表数量一致。
+
+    # 原始价格表:
+    {price_list}"""
+
+    messages = [{"role": "user", "content": base_prompt}]
 
-    # Step 3: Convert back to JSON string (if needed)
-    json_output = json.dumps(filtered_contract_data, ensure_ascii=False, indent=4)
-
-    prompt = """You are given a price list in JSON format. Extract the following information in CSV format:
+    client = OpenAI(
+        base_url="https://router.huggingface.co/novita",
+        api_key=HF_API_KEY,
+    )
 
-    # Response Format
-    Return the extracted information as a CSV in the exact format shown below:
+    for attempt in range(3):
+        print(f"🔁 Attempt {attempt + 1} to extract and validate Price List")
 
-    物料名称, 物料名称(英文), 物料规格, 采购数量, 单位, 单价, 计划号
+        try:
+            response = client.chat.completions.create(
+                model="deepseek/deepseek-r1-distill-qwen-14b",
+                messages=messages,
+            )
+            raw = response.choices[0].message.content
 
-    JSON data:""" + f"""
-    {json_output}"""
+            # Strip out LLM artifacts
+            raw = re.sub(r"<think>.*?</think>\s*", "", raw, flags=re.DOTALL)
+            raw = re.sub(r"^```json\n|```$", "", raw.strip(), flags=re.DOTALL)
 
-    messages = [
-        {
+            # Wrap the raw JSON in a proper structure if it's a list
+            if raw.strip().startswith('['):
+                raw = '{"items": ' + raw + '}'
+
+            validated = PriceListModel.model_validate_json(raw)
+            price_list_json = validated.model_dump(by_alias=True)["items"]
+
+            if save_json:
+                with open(json_name, "w", encoding="utf-8") as f:
+                    json.dump(price_list_json, f, ensure_ascii=False, indent=4)
+                print(f"✅ Saved to {json_name}")
+
+            return price_list_json
+
+        except ValidationError as ve:
+            error_msg = f"Pydantic validation error: {ve}"
+        except Exception as e:
+            error_msg = f"Unexpected error: {e}"
+
+        print(f"❌ {error_msg}")
+        messages.append({
            "role": "user",
-            "content": prompt
-        }
-    ]
+            "content": f"Your previous attempt gave this error: {error_msg}. Please try again ensuring your response is valid JSON with correct format."
+        })
 
-    client = OpenAI(
-        base_url="https://router.huggingface.co/novita",
-        api_key=HF_API_KEY,
-    )
-
-    completion = client.chat.completions.create(
-        model="deepseek/deepseek-r1-distill-qwen-14b",
-        messages=messages,
-    )
-
-    price_list = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL)
+    print("⚠️ Failed after 3 attempts.")
+    return raw
 
-    price_list = re.sub(r"^```json\n|```$", "", price_list, flags=re.DOTALL)
 
-
 def json_to_excel(contract_summary, json_data, excel_path):
     """Converts extracted JSON tables to an Excel file."""
 
@@ -507,7 +568,7 @@ def json_to_excel(contract_summary, json_data, excel_path):
 #--- Extract PO ------------------------------
 
 def extract_po(docx_path):
-    """Processes a single .docx file, extracts tables, formats with OpenAI, and saves as an Excel file."""
+    """Processes a single .docx file, extracts tables, formats with OpenAI, and returns combined JSON data."""
     if not os.path.exists(docx_path) or not docx_path.endswith(".docx"):
        raise ValueError(f"Invalid file: {docx_path}")
 
@@ -518,28 +579,42 @@ def extract_po(docx_path):
     # Step 1: Extract XML content from DOCX
     print("Extracting Docs data to XML...")
     xml_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_document.xml"
-    xml_file = extract_docx_as_xml(docx_bytes, save_xml=True, xml_filename=xml_filename)
+    xml_file = extract_docx_as_xml(docx_bytes, save_xml=False, xml_filename=xml_filename)
 
     get_namespace(ET.fromstring(xml_file))
 
     # Step 2: Extract tables from DOCX and save JSON
     print("Extracting XML data to JSON...")
     json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
-    extracted_data = xml_to_json(xml_file, save_json=True, json_filename=json_filename)
+    extracted_data = xml_to_json(xml_file, save_json=False, json_filename=json_filename)
 
-    # Step 2: Process JSON with OpenAI to get structured output
-    print("Processing JSON data with AI...")
+    # Step 3: Process JSON with OpenAI to get structured output
+    print("Processing Contract Summary data with AI...")
     contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
-    contract_summary = deepseek_extract_contract_summary(extracted_data, save_json=True, json_filename=contract_summary_filename)
-
-    # Step 3: Save formatted data as Excel
-    print("Converting AI Generated JSON to Excel...")
-    excel_output_path = os.path.splitext(docx_path)[0] + ".xlsx"
-    json_to_excel(contract_summary, extracted_data, excel_output_path)
-
-    print(f"Excel file saved at: {excel_output_path}")
+    contract_summary = deepseek_extract_contract_summary(extracted_data, save_json=False, json_filename=contract_summary_filename)
 
+    # Find the last long table (excluding summary tables)
+    print("Processing Price List data with AI...")
+    long_tables = [
+        table for key, table in json.loads(extracted_data).items()
+        if "long_table" in key and "summary" not in key
+    ]
+    last_long_table = long_tables[-1] if long_tables else {}
+
+    # Generate the price list filename in the same folder as the document
+    price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
+
+    # Process the price list and save it to a JSON file
+    price_list = deepseek_extract_price_list(last_long_table, save_json=True, json_name=price_list_filename)
+
+    # Step 4: Combine contract summary and long table data into a single JSON object
+    print("Combining AI Generated JSON with Extracted Data...")
 
+    combined_data = {
+        "contract_summary": json.loads(json.loads(contract_summary)),
+        "price_list": price_list
+    }
+
     # Logging
     log = f"""Results:
 
@@ -547,20 +622,20 @@ def extract_po(docx_path):
 
     RAW Extracted Data: {extracted_data},
 
-    XML Preview: {xml_file[:1000]}"""
+    Combined JSON: {json.dumps(combined_data, ensure_ascii=False, indent=4)}"""
 
     print(log)
-
     logging.info(f"""{log}""")
 
-
-    return excel_output_path
+    return combined_data
 
 # Example Usage
 
 # extract_po("test-contract-converted.docx")
 # extract_po("test-contract.docx")
 
+# print(deepseek_extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
+
 # Gradio Interface ------------------------------
 
 import gradio as gr
@@ -570,9 +645,10 @@ interface = gr.Interface(
     fn=extract_po,
     title="PO Extractor 买卖合同数据提取",
     inputs=gr.File(label="买卖合同 (.docx)"),
-    outputs=gr.File(label="数据提取结果 (.xlsx)"),
+    outputs=gr.Json(label="提取结果"),
     flagging_mode="never",
     theme=Base()
 )
 
 interface.launch()
+
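
For reference, a minimal standalone sketch of the validate-and-retry pattern this commit introduces in deepseek_extract_price_list, assuming pydantic v2; fake_llm is a hypothetical stub standing in for the DeepSeek call, and Item/ItemList are illustrative stand-ins for PriceItem/PriceListModel.

import json
from typing import List

from pydantic import BaseModel, Field, ValidationError


class Item(BaseModel):
    序号: str
    名称_英文: str = Field(..., alias="名称(英文)")


class ItemList(BaseModel):
    items: List[Item]


def fake_llm(messages):
    # Hypothetical stub: the real code calls DeepSeek via the HF router here.
    return json.dumps([{"序号": "1", "名称(英文)": "PE corrugated pipe"}], ensure_ascii=False)


def extract_with_retry(messages, attempts=3):
    raw = ""
    for _ in range(attempts):
        raw = fake_llm(messages).strip()
        if raw.startswith("["):
            # The model returns a bare JSON array; wrap it to match the schema.
            raw = '{"items": ' + raw + "}"
        try:
            validated = ItemList.model_validate_json(raw)
            return validated.model_dump(by_alias=True)["items"]
        except ValidationError as ve:
            # Feed the validation error back so the next attempt can self-correct.
            messages.append({"role": "user", "content": f"Previous output was invalid: {ve}"})
    return raw  # give up after the last attempt, as the commit does


print(extract_with_retry([{"role": "user", "content": "..."}]))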