Alioth86 committed on
Commit 50f128f · 1 Parent(s): 421c363

Delete app.py

Files changed (1)
  1. app.py +0 -160
app.py DELETED
@@ -1,160 +0,0 @@
- import PyPDF2
- import pdfplumber
- from pdfminer.high_level import extract_pages, extract_text
- from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
- import re
- import torch
- import transformers
- from transformers import pipeline
- from datasets import load_dataset
- import soundfile as sf
- from IPython.display import Audio
- import sentencepiece as spm
- import os
- import tempfile
- import gradio as gr
-
- description = """**SpeechAbstractor**\n
- This app enables users to upload academic articles in PDF format, specifically focusing on abstracts.
- It efficiently summarizes the abstract and provides audio playback of the summarized content.
- Below are some example PDFs for you to experiment with. Feel free to explore the functionality of SpeechAbstractor!"""
-
- examples = [
-     ["Article_7.pdf"], ["Article_9.pdf"], ["Article_11.pdf"]
- ]
-
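- # Overview: read the PDF page by page, clean the extracted text, locate the
- # abstract, summarize it, then synthesize the summary to speech for playback.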
- # helper functions created for part 1: per-line text and formatting extraction
- def text_extraction(element):
-     # extract the text of the line
-     line_text = element.get_text()
-
-     # collect the font name and size of every character in the line
-     line_formats = []
-     for text_line in element:
-         if isinstance(text_line, LTTextContainer):
-             for character in text_line:
-                 if isinstance(character, LTChar):
-                     line_formats.append(character.fontname)
-                     line_formats.append(character.size)
-     format_per_line = list(set(line_formats))
-
-     return (line_text, format_per_line)
-
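- # read_pdf walks each page with pdfminer's extract_pages, gathers every text
- # line (plus its formatting from text_extraction) in top-to-bottom order, and
- # stores the results per page in a dictionary keyed by 'Page_<n>'.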
- def read_pdf(pdf_pathy):
-     # open the file and create a PyPDF2 reader object
-     pdfFileObj = open(pdf_pathy, 'rb')
-     pdfReaded = PyPDF2.PdfReader(pdfFileObj)
-
-     text_per_pagy = {}
-     for pagenum, page in enumerate(extract_pages(pdf_pathy)):
-         print("Processing Page_" + str(pagenum))
-         pageObj = pdfReaded.pages[pagenum]
-         page_text = []
-         line_format = []
-         page_content = []
-
-         # sort the page elements from top to bottom (descending y1 coordinate)
-         page_elements = [(element.y1, element) for element in page._objs]
-         page_elements.sort(key=lambda a: a[0], reverse=True)
-
-         for i, component in enumerate(page_elements):
-             pos = component[0]
-             element = component[1]
-
-             if isinstance(element, LTTextContainer):
-                 (line_text, format_per_line) = text_extraction(element)
-                 page_text.append(line_text)
-                 line_format.append(format_per_line)
-                 page_content.append(line_text)
-
-         dctkey = 'Page_' + str(pagenum)
-         text_per_pagy[dctkey] = [page_text, line_format, page_content]
-
-     pdfFileObj.close()
-
-     return text_per_pagy
-
-
- def clean_text(text):
-     # remove extra spaces
-     text = re.sub(r'\s+', ' ', text)
-
-     return text.strip()
-
-
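- # extract_abstract scans the cleaned page text for the word "Abstract" and
- # returns everything up to the first section marker that follows (e.g.
- # "Introduction"), or to the end of the page if no marker is found.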
- def extract_abstract(text_per_pagy):
-     abstract_text = ""
-
-     for page_num, page_text in text_per_pagy.items():
-         if page_text:
-             # rejoin words hyphenated across line breaks
-             page_text = page_text.replace("- ", "")
-
-             start_index = page_text.find("Abstract")
-             if start_index != -1:
-                 start_index += len("Abstract") + 1
-
-                 end_markers = ["Introduction", "Summary", "Overview", "Background", "Contents"]
-                 end_index = -1
-
-                 for marker in end_markers:
-                     temp_index = page_text.find(marker, start_index)
-                     if temp_index != -1:
-                         end_index = temp_index
-                         break
-
-                 if end_index == -1:
-                     end_index = len(page_text)
-
-                 abstract = page_text[start_index:end_index].strip()
-                 abstract_text += " " + abstract
-
-                 # stop after the first page that contains an abstract
-                 break
-
-     return abstract_text
-
- # main function: takes the uploaded PDF filepath and produces the summary and audio
- def main_function(uploaded_filepath):
-     # check whether a file was actually uploaded
-     if uploaded_filepath is None:
-         return "No file loaded", None
-
-     # read and process the file with read_pdf
-     text_per_pagy = read_pdf(uploaded_filepath)
-
-     # clean the text and extract the abstract with the two helper functions
-     for key, value in text_per_pagy.items():
-         cleaned_text = clean_text(' '.join(value[0]))
-         text_per_pagy[key] = cleaned_text
-     abstract_text = extract_abstract(text_per_pagy)
-
-     # summarize the abstract with the summarization pipeline, capping the length
-     summarizer = pipeline("summarization", model="pszemraj/long-t5-tglobal-base-sci-simplify")
-     summary = summarizer(abstract_text, max_length=65, do_sample=False)[0]['summary_text']
-
-     # generate the audio from the summary: SpeechT5 needs a speaker embedding (x-vector)
-     synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
-     embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-     speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-     speech = synthesiser(summary, forward_params={"speaker_embeddings": speaker_embedding})
-
-     # save the audio to a wav file
-     audio_file_path = "summary.wav"
-     sf.write(audio_file_path, speech["audio"], samplerate=speech["sampling_rate"])
-
-     # return the two pieces the interface needs
-     return summary, audio_file_path
-
- # define the Gradio interface: file input, text summary and audio outputs
- iface = gr.Interface(
-     fn=main_function,
-     inputs=gr.File(type="filepath"),
-     outputs=[gr.Textbox(label="Summary Text"), gr.Audio(label="Summary Audio", type="filepath")],
-     description=description,
-     examples=examples
- )
-
- # launching the app
- if __name__ == "__main__":
-     iface.launch()