import sys

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from knowledgeassistant.entity.config_entity import KeywordExtractionConfig
from knowledgeassistant.exception.exception import KnowledgeAssistantException
from knowledgeassistant.logging.logger import logging
from knowledgeassistant.utils.main_utils.utils import read_txt_file, write_txt_file

class KeywordExtraction:
    """Extracts the top keywords from a text file using TF-IDF scores."""

    def __init__(self, keyword_extraction_config: KeywordExtractionConfig):
        try:
            self.keyword_extraction_config = keyword_extraction_config
        except Exception as e:
            raise KnowledgeAssistantException(e, sys)
    
    def extract_keywords(self, input_text_path: str, keywords_count: int):
        """Compute TF-IDF scores for the input text and write the top keywords to disk."""
        try:
            vectorizer = CountVectorizer(stop_words='english', token_pattern=r'(?u)\b[a-zA-Z]+\b')
            transformer = TfidfTransformer()
            logging.info("Vectorizer and Transformer successfully setup")

            text = read_txt_file(file_path=input_text_path)
            word_count_matrix = vectorizer.fit_transform([text])
            tfidf_matrix = transformer.fit_transform(word_count_matrix)
            logging.info("Successfully calculated word count and their tfidf scores")

            # The token pattern already limits features to alphabetic tokens, so no extra
            # filtering is needed (it would also misalign indices with the TF-IDF columns).
            feature_array = np.array(vectorizer.get_feature_names_out())
            tfidf_sorting = np.argsort(tfidf_matrix.toarray()).flatten()[::-1]
            logging.info("Successfully extracted keywords and sorted them in descending order of their TF-IDF scores")

            top_keywords = feature_array[tfidf_sorting][:keywords_count]
            content = "\n".join(top_keywords)
            write_txt_file(
                self.keyword_extraction_config.extracted_keywords_file_path,
                content,
                True
            )
            logging.info(f"Successfully extracted and wrote top {keywords_count} keywords from text")
        except Exception as e:
            raise KnowledgeAssistantException(e, sys)
    
    def initiate_keyword_extraction(self, input_text_path: str, keywords_count: int):
        """Entry point that runs keyword extraction for the given input file."""
        try:
            self.extract_keywords(
                input_text_path = input_text_path,
                keywords_count = keywords_count
            )
        except Exception as e:
            raise KnowledgeAssistantException(e, sys)
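

# A minimal usage sketch (illustrative only). It assumes KeywordExtractionConfig can be
# constructed with its defaults and that the input path below exists; both the path and
# the keyword count are hypothetical placeholders, not values defined by this module.
if __name__ == "__main__":
    config = KeywordExtractionConfig()
    keyword_extraction = KeywordExtraction(keyword_extraction_config=config)
    keyword_extraction.initiate_keyword_extraction(
        input_text_path="artifacts/input_text.txt",  # hypothetical example path
        keywords_count=10,
    )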