File size: 2,608 Bytes
c965e8a
 
 
 
 
 
 
 
08b129e
c965e8a
 
 
 
 
 
 
 
 
 
14f8a5c
 
 
 
 
c965e8a
14f8a5c
c965e8a
14f8a5c
 
 
 
 
 
 
 
 
 
 
 
 
c965e8a
14f8a5c
 
c965e8a
 
14f8a5c
 
 
 
 
 
 
c965e8a
 
14f8a5c
c965e8a
 
 
 
 
 
 
 
5c9215b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from knowledgeassistant.exception.exception import KnowledgeAssistantException
from knowledgeassistant.logging.logger import logging

from knowledgeassistant.entity.config_entity import DataSummarizationConfig
from knowledgeassistant.utils.main_utils.utils import write_txt_file, read_txt_file

import sys
import torch
from transformers import pipeline, AutoTokenizer

class DataSummarization:
    """Summarize a text file with the locally stored summarization model.

    The generated summary is written to ``summarized_text_file_path`` of the
    supplied configuration and is also returned to the caller together with
    an optional user-facing truncation warning.
    """

    # Local directory the model and its tokenizer are loaded from.
    MODEL_PATH = "/app/models/bart-large-cnn"
    # Largest tokenized input (special tokens included) handed to the model.
    MAX_INPUT_TOKENS = 1024
    # Passed to the pipeline as ``max_length`` for the generated summary.
    MAX_SUMMARY_LENGTH = 142

    def __init__(self, data_summarization_config: "DataSummarizationConfig"):
        """Store the configuration that names the summary output file."""
        self.data_summarization_config = data_summarization_config

    def _fit_to_token_limit(self, tokenizer, text: str):
        """Return ``(text, warning)`` with ``text`` cut to ``MAX_INPUT_TOKENS``.

        ``warning`` is an empty string when nothing had to be removed and a
        user-facing message otherwise.
        """
        limit = self.MAX_INPUT_TOKENS

        # Count the tokens of the untruncated text: measuring an already
        # truncated encoding cannot tell "exactly at the limit" apart from
        # "over the limit" and would report a false truncation.
        token_count = len(tokenizer.encode(text, truncation=False))
        if token_count <= limit:
            return text, ""

        logging.warning("Input text exceeded %d tokens. It has been truncated.", limit)
        kept_ids = tokenizer.encode(text, truncation=True, max_length=limit)
        truncated_text = tokenizer.decode(kept_ids, skip_special_tokens=True)
        frontend_message = (
            f"Your input text exceeded the limit of {limit} tokens and has been truncated."
        )
        return truncated_text, frontend_message

    def summarize(self, input_text_path: str, min_length: int) -> dict:
        """Summarize the text stored at ``input_text_path``.

        Args:
            input_text_path: Path of the text file to summarize.
            min_length: Forwarded to the pipeline as ``min_length``.

        Returns:
            A dict with the keys ``"summary"`` (the generated summary text)
            and ``"warning"`` (a truncation notice, or ``""`` if the input
            fitted within ``MAX_INPUT_TOKENS``).

        Raises:
            KnowledgeAssistantException: If any step fails; the original
                exception is chained as the cause.
        """
        try:
            tokenizer = AutoTokenizer.from_pretrained(self.MODEL_PATH)
            # Hand the loaded tokenizer to the pipeline so it is not read
            # from disk a second time.
            pipe = pipeline("summarization", model=self.MODEL_PATH, tokenizer=tokenizer)
            logging.info("Summarization Pipeline Successfully Setup")

            text = read_txt_file(input_text_path)
            model_input, frontend_message = self._fit_to_token_limit(tokenizer, text)

            # truncation=True is a safety net: decoding and re-tokenizing the
            # shortened text is not guaranteed to give the same token count.
            summary = pipe(
                model_input,
                min_length=min_length,
                max_length=self.MAX_SUMMARY_LENGTH,
                do_sample=False,
                truncation=True,
            )
            logging.info("Text successfully summarized")
            summary_text = summary[0].get("summary_text")

            write_txt_file(
                self.data_summarization_config.summarized_text_file_path,
                summary_text,
            )
            logging.info("Successfully wrote summarized text")

            return {
                "summary": summary_text,
                "warning": frontend_message,
            }
        except Exception as e:
            raise KnowledgeAssistantException(e, sys) from e

    def initiate_data_summarization(self, input_text_path: str, min_length: int) -> dict:
        """Run :meth:`summarize` and return its result.

        Args:
            input_text_path: Path of the text file to summarize.
            min_length: Forwarded to :meth:`summarize`.

        Returns:
            The dict produced by :meth:`summarize` (keys ``"summary"`` and
            ``"warning"``).

        Raises:
            KnowledgeAssistantException: If summarization fails.
        """
        try:
            return self.summarize(
                input_text_path=input_text_path,
                min_length=min_length,
            )
        except Exception as e:
            raise KnowledgeAssistantException(e, sys) from e