Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -34,9 +34,9 @@ def get_namespace(root):
|
|
34 |
"""Extracts the primary namespace from the XML root element while keeping the default."""
|
35 |
global NS
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
return NS
|
41 |
|
42 |
# --- Helper Functions for DOCX Processing ---
|
@@ -307,7 +307,6 @@ def extract_text_outside_tables(root, table_paragraphs):
|
|
307 |
"""Extracts text from paragraphs outside tables in the document."""
|
308 |
extracted_text = []
|
309 |
|
310 |
-
# print(ET.tostring(root, encoding='unicode'))
|
311 |
for paragraph in root.findall('.//w:p', NS):
|
312 |
if paragraph in table_paragraphs:
|
313 |
continue # Skip paragraphs inside tables
|
@@ -422,8 +421,6 @@ def deepseek_extract_price_list(json_data):
|
|
422 |
# Step 3: Convert back to JSON string (if needed)
|
423 |
json_output = json.dumps(filtered_contract_data, ensure_ascii=False, indent=4)
|
424 |
|
425 |
-
print(json_output)
|
426 |
-
|
427 |
prompt = """You are given a price list in JSON format. Extract the following information in CSV format:
|
428 |
|
429 |
# Response Format
|
@@ -455,7 +452,6 @@ JSON data:""" + f"""
|
|
455 |
|
456 |
price_list = re.sub(r"^```json\n|```$", "", price_list, flags=re.DOTALL)
|
457 |
|
458 |
-
print(price_list)
|
459 |
|
460 |
def json_to_excel(contract_summary, json_data, excel_path):
|
461 |
"""Converts extracted JSON tables to an Excel file."""
|
@@ -463,8 +459,6 @@ def json_to_excel(contract_summary, json_data, excel_path):
|
|
463 |
# Correctly parse the JSON string
|
464 |
contract_summary_json = json.loads(json.loads(contract_summary))
|
465 |
|
466 |
-
print(contract_summary_json)
|
467 |
-
|
468 |
contract_summary_df = pd.DataFrame([contract_summary_json])
|
469 |
|
470 |
# Ensure json_data is a dictionary
|
@@ -547,4 +541,4 @@ interface = gr.Interface(
|
|
547 |
theme=Base()
|
548 |
)
|
549 |
|
550 |
-
interface.launch()
|
|
|
34 |
"""Extracts the primary namespace from the XML root element while keeping the default."""
|
35 |
global NS
|
36 |
|
37 |
+
ns = root.tag.split('}')[0].strip('{')
|
38 |
+
NS = {'w': ns} if ns else DEFAULT_NS
|
39 |
+
|
40 |
return NS
|
41 |
|
42 |
# --- Helper Functions for DOCX Processing ---
|
|
|
307 |
"""Extracts text from paragraphs outside tables in the document."""
|
308 |
extracted_text = []
|
309 |
|
|
|
310 |
for paragraph in root.findall('.//w:p', NS):
|
311 |
if paragraph in table_paragraphs:
|
312 |
continue # Skip paragraphs inside tables
|
|
|
421 |
# Step 3: Convert back to JSON string (if needed)
|
422 |
json_output = json.dumps(filtered_contract_data, ensure_ascii=False, indent=4)
|
423 |
|
|
|
|
|
424 |
prompt = """You are given a price list in JSON format. Extract the following information in CSV format:
|
425 |
|
426 |
# Response Format
|
|
|
452 |
|
453 |
price_list = re.sub(r"^```json\n|```$", "", price_list, flags=re.DOTALL)
|
454 |
|
|
|
455 |
|
456 |
def json_to_excel(contract_summary, json_data, excel_path):
|
457 |
"""Converts extracted JSON tables to an Excel file."""
|
|
|
459 |
# Correctly parse the JSON string
|
460 |
contract_summary_json = json.loads(json.loads(contract_summary))
|
461 |
|
|
|
|
|
462 |
contract_summary_df = pd.DataFrame([contract_summary_json])
|
463 |
|
464 |
# Ensure json_data is a dictionary
|
|
|
541 |
theme=Base()
|
542 |
)
|
543 |
|
544 |
+
interface.launch()
|