import sys

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from knowledgeassistant.exception.exception import KnowledgeAssistantException
from knowledgeassistant.logging.logger import logging
from knowledgeassistant.entity.config_entity import KeywordExtractionConfig
from knowledgeassistant.utils.main_utils.utils import read_txt_file, write_txt_file


class KeywordExtraction:
    def __init__(self, keyword_extraction_config: KeywordExtractionConfig):
        try:
            self.keyword_extraction_config = keyword_extraction_config
        except Exception as e:
            raise KnowledgeAssistantException(e, sys)

    def extract_keywords(self, input_text_path: str, keywords_count: int):
        """Extract the top `keywords_count` keywords from the text file at
        `input_text_path` using TF-IDF scores and write them, one per line,
        to the configured output file."""
        try:
            # Count only alphabetic tokens and drop English stop words.
            vectorizer = CountVectorizer(stop_words='english', token_pattern=r'(?u)\b[a-zA-Z]+\b')
            transformer = TfidfTransformer()
            logging.info("Vectorizer and transformer successfully set up")

            text = read_txt_file(file_path=input_text_path)

            # Build the term-count matrix for the single document, then
            # convert the raw counts to TF-IDF scores.
            word_count_matrix = vectorizer.fit_transform([text])
            tfidf_matrix = transformer.fit_transform(word_count_matrix)
            logging.info("Successfully calculated word counts and their TF-IDF scores")

            # The token pattern above already restricts the vocabulary to
            # alphabetic words, so the feature names can be used directly;
            # filtering them here would misalign them with the matrix columns.
            feature_array = vectorizer.get_feature_names_out()
            # Feature indices sorted by TF-IDF score, highest first.
            tfidf_sorting = np.argsort(tfidf_matrix.toarray()).flatten()[::-1]
            logging.info("Successfully sorted keywords in descending order of their TF-IDF scores")

            top_keywords = feature_array[tfidf_sorting][:keywords_count]
            content = "\n".join(top_keywords)
            write_txt_file(
                self.keyword_extraction_config.extracted_keywords_file_path,
                content,
                True
            )
            logging.info(f"Successfully extracted and wrote top {keywords_count} keywords from text")
        except Exception as e:
            raise KnowledgeAssistantException(e, sys)

    def initiate_keyword_extraction(self, input_text_path: str, keywords_count: int):
        try:
            self.extract_keywords(
                input_text_path=input_text_path,
                keywords_count=keywords_count
            )
        except Exception as e:
            raise KnowledgeAssistantException(e, sys)
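

# Minimal usage sketch, not part of the pipeline: assumes KeywordExtractionConfig
# is default-constructible and that an input text file already exists. The path
# below is a hypothetical placeholder; adapt both to the project's actual config.
if __name__ == "__main__":
    config = KeywordExtractionConfig()  # assumption: adjust to the real config_entity constructor
    extractor = KeywordExtraction(keyword_extraction_config=config)
    extractor.initiate_keyword_extraction(
        input_text_path="artifacts/extracted_text.txt",  # hypothetical input path
        keywords_count=10
    )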