def converti(path):
    # Install the required packages at runtime (written for a Colab-style
    # environment where the process runs as root, so apt-get needs no sudo)
    import subprocess
    import sys

    def install(package):
        # Invoking pip through the current interpreter is the supported way to
        # install packages programmatically
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])

    install('git+https://github.com/huggingface/transformers.git')
    install('datasets')
    install('sentencepiece')
    install('PyPDF2')
    install('pdfminer.six')
    install('pdfplumber')
    # poppler-utils, tesseract-ocr and libtesseract-dev are system packages,
    # not pip packages, so install them with apt-get instead
    subprocess.check_call(['apt-get', 'install', '-y',
                           'poppler-utils', 'tesseract-ocr', 'libtesseract-dev'])
    
    # To read the PDF
    import PyPDF2
    # To analyze the PDF layout and extract text
    from pdfminer.high_level import extract_pages, extract_text
    from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
    # To extract text from tables in PDF
    import pdfplumber
    # To build file-system paths
    import os
    
    # Create a function to extract text
    
    def text_extraction(element):
        # Extracting the text from the in-line text element
        line_text = element.get_text()
    
        # Find the formats of the text
        # Initialize the list with all the formats that appeared in the line of text
        line_formats = []
        for text_line in element:
            if isinstance(text_line, LTTextContainer):
                # Iterating through each character in the line of text
                for character in text_line:
                    if isinstance(character, LTChar):
                        # Append the font name of the character
                        line_formats.append(character.fontname)
                        # Append the font size of the character
                        line_formats.append(character.size)
        # Find the unique font sizes and names in the line
        format_per_line = list(set(line_formats))
    
        # Return a tuple with the text in each line along with its format
        return (line_text, format_per_line)
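
    # For reference, a hypothetical return value of text_extraction for a single
    # text line (font names and sizes depend on the PDF, so this tuple is only
    # illustrative):
    #
    #     ('Abstract\n', ['NimbusRomNo9L-Medi', 11.9])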
    
    def read_pdf(pdf_path):
        # Create a PDF file object and a PyPDF2 reader for it
        pdfFileObj = open(pdf_path, 'rb')
        pdfReader = PyPDF2.PdfReader(pdfFileObj)

        # Dictionary that will hold the extracted content of each page
        text_per_page = {}
        # Open the file with pdfplumber once, for table detection
        pdf = pdfplumber.open(pdf_path)

        # Extract the pages from the PDF
        for pagenum, page in enumerate(extract_pages(pdf_path)):
            print("Processing Page_" + str(pagenum))
            # Initialize the variables needed for the text extraction from the page
            pageObj = pdfReader.pages[pagenum]
            page_text = []
            line_format = []
            text_from_images = []
            text_from_tables = []
            page_content = []
            # Initialize the number of the examined tables
            table_num = 0
            first_element = True
            table_extraction_flag = False
            # Find the examined page and the tables it contains
            page_tables = pdf.pages[pagenum]
            tables = page_tables.find_tables()

            # Find all the elements and sort them as they appear on the page,
            # from top to bottom
            page_elements = [(element.y1, element) for element in page._objs]
            page_elements.sort(key=lambda a: a[0], reverse=True)

            # Walk through the elements that compose the page
            for i, component in enumerate(page_elements):
                # Extract the position of the top side of the element in the PDF
                pos = component[0]
                # Extract the element of the page layout
                element = component[1]

                # Check if the element is a text element
                if isinstance(element, LTTextContainer):
                    # Check if the text appeared in a table
                    if not table_extraction_flag:
                        # Use the function to extract the text and format
                        # for each text element
                        (line_text, format_per_line) = text_extraction(element)
                        # Append the text of each line to the page text
                        page_text.append(line_text)
                        # Append the format for each line containing text
                        line_format.append(format_per_line)
                        page_content.append(line_text)
                    else:
                        # Omit the text that appeared in a table
                        pass

            # Create the key of the dictionary
            dctkey = 'Page_' + str(pagenum)
            # Add the list of lists as the value of the page key
            text_per_page[dctkey] = [page_text, line_format, text_from_images,
                                     text_from_tables, page_content]

        # Close the file objects
        pdf.close()
        pdfFileObj.close()

        return text_per_page
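
    # read_pdf returns a dictionary keyed by 'Page_0', 'Page_1', ..., where each
    # value is [page_text, line_format, text_from_images, text_from_tables,
    # page_content]; in this simplified version the image and table lists are
    # left empty.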
    
    # Mount Google Drive; `path` is interpreted relative to the MyDrive root
    from google.colab import drive
    drive.mount('/content/drive')

    pdf_path = os.path.join('/content/drive/MyDrive', path)
    text_per_page = read_pdf(pdf_path)
    
    # Look for the abstract: the text between the 'Abstract' heading and the
    # heading of section 1 (which pdfminer extracts as '1\n')
    abstr = ''
    for par in range(len(text_per_page)):
        for x in text_per_page['Page_' + str(par)]:
            mystring = ' '.join(map(str, x))
            if mystring.find('Abstract\n') != -1:
                abstr0 = mystring[mystring.find('Abstract\n') + len('Abstract\n'):]
                abstr = abstr0[:abstr0.find('1\n')]
    if not abstr:
        raise ValueError('Could not find an Abstract section in ' + pdf_path)
    print(abstr)
    
    # Summarize the extracted abstract with the BART CNN/DailyMail summarization model
    from transformers import pipeline
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    summary = summarizer(abstr, max_length=56)
    summary_text = summary[0]['summary_text']

    return summary_text
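

# Example usage, a minimal sketch assuming a Colab runtime with Google Drive
# available and a (hypothetical) PDF stored at MyDrive/papers/attention.pdf:
#
#     summary = converti('papers/attention.pdf')
#     print(summary)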