wt002 committed on
Commit
1e33f11
·
verified ·
1 Parent(s): 3102ee4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -19
app.py CHANGED
@@ -8,7 +8,7 @@ import requests
8
  from typing import List, Dict, Union
9
  import pandas as pd
10
  import wikipediaapi
11
- import PyPDF
12
  from docx import Document
13
 
14
  load_dotenv()
@@ -72,26 +72,24 @@ class BasicAgent:
72
  page = self.wiki.page(query)
73
  return page.summary if page.exists() else "No Wikipedia page found"
74
 
75
- def process_document(self, file_path: str) -> str:
76
- """Extract text from PDF (works with PyPDF2 or pypdf)"""
77
- if not os.path.exists(file_path):
78
- return "File not found"
79
-
80
- if file_path.lower().endswith('.pdf'):
81
  try:
82
- # Try modern pypdf first
83
- from pypdf import PdfReader
84
- except ImportError:
85
- # Fallback to PyPDF2
86
  from PyPDF2 import PdfReader
87
-
88
- try:
89
- with open(file_path, 'rb') as f:
90
- reader = PdfReader(f)
91
- text = "\n".join([page.extract_text() for page in reader.pages])
92
- return text if text.strip() else "PDF has no extractable text"
93
- except Exception as e:
94
- return f"PDF processing error: {str(e)}"
 
95
 
96
 
97
  def __call__(self, query: str) -> str:
 
8
  from typing import List, Dict, Union
9
  import pandas as pd
10
  import wikipediaapi
11
+ from pypdf import PdfReader
12
  from docx import Document
13
 
14
  load_dotenv()
 
72
  page = self.wiki.page(query)
73
  return page.summary if page.exists() else "No Wikipedia page found"
74
 
75
+ def extract_pdf_text(file_path: str) -> str:
76
+ """Works with both pypdf and PyPDF2."""
77
+ try:
78
+ # Prefer 'pypdf' (newer)
79
+ from pypdf import PdfReader
80
+ except ImportError:
81
  try:
82
+ # Fallback to 'PyPDF2'
 
 
 
83
  from PyPDF2 import PdfReader
84
+ except ImportError:
85
+ return "Error: Install 'pypdf' or 'PyPDF2' first (pip install pypdf)."
86
+
87
+ try:
88
+ with open(file_path, 'rb') as f:
89
+ reader = PdfReader(f)
90
+ return "\n".join(page.extract_text() for page in reader.pages)
91
+ except Exception as e:
92
+ return f"Failed to read PDF: {str(e)}"
93
 
94
 
95
  def __call__(self, query: str) -> str: