Spaces:

ProfessorLeVesseur
/

PDF_Topic_Extraction_Analysis_App

Running

App Files Files Community

ProfessorLeVesseur committed on Apr 10

Commit

0d32d1f

verified ·

1 Parent(s): 3670dab

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -24

app.py CHANGED Viewed

@@ -301,12 +301,10 @@ for key in ['pdf_processed', 'markdown_texts', 'df']:
 # ---------------------------------------------------------------------------------------
 # API Configuration
 # ---------------------------------------------------------------------------------------
-# Retrieve Hugging Face API key from environment variables
 hf_api_key = os.getenv('HF_API_KEY')
 if not hf_api_key:
     raise ValueError("HF_API_KEY not set in environment variables")
-# Create the Hugging Face inference client
 client = InferenceClient(api_key=hf_api_key)
 # ---------------------------------------------------------------------------------------
@@ -321,8 +319,8 @@ class SurveyAnalysis:
 Instructions:
 - Extract exact quotes per topic.
 - Ignore irrelevant topics.
-Format:
 [Topic]
 - "Exact quote"
@@ -331,32 +329,31 @@ Meeting Notes:
 """
     def prompt_response_from_hf_llm(self, llm_input):
-        # Define a system prompt to guide the model's responses
         system_prompt = """
-        <Persona> An expert Implementation Specialist at Michigan's Multi-Tiered System of Support Technical Assistance Center (MiMTSS TA Center) with deep expertise in SWPBIS, SEL, Structured Literacy, Science of Reading, and family engagement practices.</Persona>
-        <Task> Analyze educational data and provide evidence-based recommendations for improving student outcomes across multiple tiers of support, drawing from established frameworks in behavioral interventions, literacy instruction, and family engagement.</Task>
-        <Context> Operating within Michigan's educational system to support schools in implementing multi-tiered support systems, with access to student metrics data and knowledge of state-specific educational requirements and MTSS frameworks. </Context>
-        <Format> Deliver insights through clear, actionable recommendations supported by data analysis, incorporating technical expertise while maintaining accessibility for educators and administrators at various levels of MTSS implementation.</Format>
         """
-        # Generate the refined prompt using Hugging Face API
         response = client.chat.completions.create(
             model="meta-llama/Llama-3.1-70B-Instruct",
             messages=[
-                {"role": "system", "content": system_prompt},  # Add system prompt here
                 {"role": "user", "content": llm_input}
             ],
-            stream=True,
             temperature=0.5,
             max_tokens=1024,
             top_p=0.7
         )
-        # Combine messages if response is streamed
-        response_content = ""
-        for message in response:
-            response_content += message.choices[0].delta.content
         return response_content.strip()
     def extract_text(self, response):
@@ -367,7 +364,6 @@ Meeting Notes:
         for _, row in df.iterrows():
             llm_input = self.prepare_llm_input(row['Document_Text'], topics)
             response = self.prompt_response_from_hf_llm(llm_input)
-            print("AI Response:", response)  # Debugging: print the AI response
             notes = self.extract_text(response)
             results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
         return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
@@ -408,17 +404,24 @@ def extract_markdown_from_image(image):
     doc.load_from_doctags(doctags_doc)
     return doc.export_to_markdown()
 def extract_excerpts(processed_df):
     rows = []
     for _, r in processed_df.iterrows():
-        for sec in re.split(r'\n(?=\[)', r['Topic_Summary']):
-            topic_match = re.match(r'\[([^\]]+)\]', sec)
             if topic_match:
-                topic = topic_match.group(1)
-                excerpts = re.findall(r'- "([^"]+)"', sec)
                 for excerpt in excerpts:
-                    rows.append({'Document_Text': r['Document_Text'], 'Topic_Summary': r['Topic_Summary'], 'Excerpt': excerpt, 'Topic': topic})
-    print("Extracted Rows:", rows)  # Debugging: print extracted rows
     return pd.DataFrame(rows)
 # ---------------------------------------------------------------------------------------

 # ---------------------------------------------------------------------------------------
 # API Configuration
 # ---------------------------------------------------------------------------------------
 hf_api_key = os.getenv('HF_API_KEY')
 if not hf_api_key:
     raise ValueError("HF_API_KEY not set in environment variables")
 client = InferenceClient(api_key=hf_api_key)
 # ---------------------------------------------------------------------------------------
 Instructions:
 - Extract exact quotes per topic.
 - Ignore irrelevant topics.
+- Strictly follow this format:
 [Topic]
 - "Exact quote"
 """
     def prompt_response_from_hf_llm(self, llm_input):
         system_prompt = """
+        You are an expert assistant tasked with extracting exact quotes from provided meeting notes based on given topics.
+        Instructions:
+        - Only extract exact quotes relevant to provided topics.
+        - Ignore irrelevant content.
+        - Strictly follow this format:
+        [Topic]
+        - "Exact quote"
         """
         response = client.chat.completions.create(
             model="meta-llama/Llama-3.1-70B-Instruct",
             messages=[
+                {"role": "system", "content": system_prompt},
                 {"role": "user", "content": llm_input}
             ],
             temperature=0.5,
             max_tokens=1024,
             top_p=0.7
         )
+        response_content = response.choices[0].message.content
+        print("Full AI Response:", response_content)  # Debugging
         return response_content.strip()
     def extract_text(self, response):
         for _, row in df.iterrows():
             llm_input = self.prepare_llm_input(row['Document_Text'], topics)
             response = self.prompt_response_from_hf_llm(llm_input)
             notes = self.extract_text(response)
             results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
         return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
     doc.load_from_doctags(doctags_doc)
     return doc.export_to_markdown()
+# Revised extract_excerpts function with improved robustness
 def extract_excerpts(processed_df):
     rows = []
     for _, r in processed_df.iterrows():
+        sections = re.split(r'\n(?=(?:\*\*|\[)?[A-Za-z/ ]+(?:\*\*|\])?\n- )', r['Topic_Summary'])
+        for sec in sections:
+            topic_match = re.match(r'(?:\*\*|\[)?([A-Za-z/ ]+)(?:\*\*|\])?', sec.strip())
             if topic_match:
+                topic = topic_match.group(1).strip()
+                excerpts = re.findall(r'- "?([^"\n]+)"?', sec)
                 for excerpt in excerpts:
+                    rows.append({
+                        'Document_Text': r['Document_Text'],
+                        'Topic_Summary': r['Topic_Summary'],
+                        'Excerpt': excerpt.strip(),
+                        'Topic': topic
+                    })
+    print("Extracted Rows:", rows)  # Debugging
     return pd.DataFrame(rows)
 # ---------------------------------------------------------------------------------------