vishalsh13's picture
commit as its working on local
e4dba65
raw
history blame contribute delete
816 Bytes
import fitz # PyMuPDF
import pandas as pd
from pptx import Presentation
def extract_text_from_file(v_file_path):
    """
    Extracts text from PDF, PPTX, or CSV files.

    The format is selected from the file name's extension, compared
    case-insensitively.

    Args:
        v_file_path: Path of the file to read, as a string.

    Returns:
        The extracted text as a single string:
        - ``.pdf``: the text of every page, concatenated in page order.
        - ``.pptx``: the text of every shape that has a text frame, each
          followed by a newline, in slide order.
        - ``.csv``: the string rendering of the parsed DataFrame.
        An empty string is returned for any other extension.
    """
    # Lower-case once so every extension check is case-insensitive.
    v_lower_path = v_file_path.lower()

    if v_lower_path.endswith('.pdf'):
        # The context manager closes the document even when extracting
        # a page raises, so the file handle is never leaked.
        with fitz.open(v_file_path) as obj_pdf:
            return "".join(obj_page.get_text() for obj_page in obj_pdf)

    if v_lower_path.endswith('.pptx'):
        obj_ppt = Presentation(v_file_path)
        # Shapes without a text frame (pictures, charts, ...) are skipped.
        return "".join(
            obj_shape.text_frame.text + "\n"
            for obj_slide in obj_ppt.slides
            for obj_shape in obj_slide.shapes
            if obj_shape.has_text_frame
        )

    if v_lower_path.endswith('.csv'):
        return pd.read_csv(v_file_path).to_string()

    # Unsupported extension: nothing to extract.
    return ""