def converti(path):
    # Install the required packages at runtime (written for a Colab-style
    # environment where the process runs as root, so apt-get needs no sudo)
    import subprocess
    import sys

    def install(package):
        # Invoking pip through the current interpreter is the supported way to
        # install packages programmatically
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])

    install('git+https://github.com/huggingface/transformers.git')
    install('datasets')
    install('sentencepiece')
    install('PyPDF2')
    install('pdfminer.six')
    install('pdfplumber')
    # poppler-utils, tesseract-ocr and libtesseract-dev are system packages,
    # not pip packages, so install them with apt-get instead
    subprocess.check_call(['apt-get', 'install', '-y',
                           'poppler-utils', 'tesseract-ocr', 'libtesseract-dev'])
    
    # To read the PDF
    import PyPDF2
    # To analyze the PDF layout and extract text
    from pdfminer.high_level import extract_pages, extract_text
    from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
    # To extract text from tables in PDF
    import pdfplumber
    # To build file-system paths
    import os
    
    # Create a function to extract text
    
    def text_extraction(element):
        # Extracting the text from the in-line text element
        line_text = element.get_text()
    
        # Find the formats of the text
        # Initialize the list with all the formats that appeared in the line of text
        line_formats = []
        for text_line in element:
            if isinstance(text_line, LTTextContainer):
                # Iterating through each character in the line of text
                for character in text_line:
                    if isinstance(character, LTChar):
                        # Append the font name of the character
                        line_formats.append(character.fontname)
                        # Append the font size of the character
                        line_formats.append(character.size)
        # Find the unique font sizes and names in the line
        format_per_line = list(set(line_formats))
    
        # Return a tuple with the text in each line along with its format
        return (line_text, format_per_line)
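
    # For reference, a hypothetical return value of text_extraction for a single
    # text line (font names and sizes depend on the PDF, so this tuple is only
    # illustrative):
    #
    #     ('Abstract\n', ['NimbusRomNo9L-Medi', 11.9])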
    
    def read_pdf(pdf_path):
        # Create a PDF file object and a PyPDF2 reader for it
        pdfFileObj = open(pdf_path, 'rb')
        pdfReader = PyPDF2.PdfReader(pdfFileObj)

        # Dictionary that will hold the extracted content of each page
        text_per_page = {}
        # Open the file with pdfplumber once, for table detection
        pdf = pdfplumber.open(pdf_path)

        # Extract the pages from the PDF
        for pagenum, page in enumerate(extract_pages(pdf_path)):
            print("Processing Page_" + str(pagenum))
            # Initialize the variables needed for the text extraction from the page
            pageObj = pdfReader.pages[pagenum]
            page_text = []
            line_format = []
            text_from_images = []
            text_from_tables = []
            page_content = []
            # Initialize the number of the examined tables
            table_num = 0
            first_element = True
            table_extraction_flag = False
            # Find the examined page and the tables it contains
            page_tables = pdf.pages[pagenum]
            tables = page_tables.find_tables()

            # Find all the elements and sort them as they appear on the page,
            # from top to bottom
            page_elements = [(element.y1, element) for element in page._objs]
            page_elements.sort(key=lambda a: a[0], reverse=True)

            # Walk through the elements that compose the page
            for i, component in enumerate(page_elements):
                # Extract the position of the top side of the element in the PDF
                pos = component[0]
                # Extract the element of the page layout
                element = component[1]

                # Check if the element is a text element
                if isinstance(element, LTTextContainer):
                    # Check if the text appeared in a table
                    if not table_extraction_flag:
                        # Use the function to extract the text and format
                        # for each text element
                        (line_text, format_per_line) = text_extraction(element)
                        # Append the text of each line to the page text
                        page_text.append(line_text)
                        # Append the format for each line containing text
                        line_format.append(format_per_line)
                        page_content.append(line_text)
                    else:
                        # Omit the text that appeared in a table
                        pass

            # Create the key of the dictionary
            dctkey = 'Page_' + str(pagenum)
            # Add the list of lists as the value of the page key
            text_per_page[dctkey] = [page_text, line_format, text_from_images,
                                     text_from_tables, page_content]

        # Close the file objects
        pdf.close()
        pdfFileObj.close()

        return text_per_page
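
    # read_pdf returns a dictionary keyed by 'Page_0', 'Page_1', ..., where each
    # value is [page_text, line_format, text_from_images, text_from_tables,
    # page_content]; in this simplified version the image and table lists are
    # left empty.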
    
    # Mount Google Drive; `path` is interpreted relative to the MyDrive root
    from google.colab import drive
    drive.mount('/content/drive')

    pdf_path = os.path.join('/content/drive/MyDrive', path)
    text_per_page = read_pdf(pdf_path)
    
    # Look for the abstract: the text between the 'Abstract' heading and the
    # heading of section 1 (which pdfminer extracts as '1\n')
    abstr = ''
    for par in range(len(text_per_page)):
        for x in text_per_page['Page_' + str(par)]:
            mystring = ' '.join(map(str, x))
            if mystring.find('Abstract\n') != -1:
                abstr0 = mystring[mystring.find('Abstract\n') + len('Abstract\n'):]
                abstr = abstr0[:abstr0.find('1\n')]
    if not abstr:
        raise ValueError('Could not find an Abstract section in ' + pdf_path)
    print(abstr)
    
    # Summarize the extracted abstract with the BART CNN/DailyMail summarization model
    from transformers import pipeline
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    summary = summarizer(abstr, max_length=56)
    summary_text = summary[0]['summary_text']

    return summary_text
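

# Example usage, a minimal sketch assuming a Colab runtime with Google Drive
# available and a (hypothetical) PDF stored at MyDrive/papers/attention.pdf:
#
#     summary = converti('papers/attention.pdf')
#     print(summary)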