Spaces:
Running
Running
from knowledgeassistant.exception.exception import KnowledgeAssistantException | |
from knowledgeassistant.logging.logger import logging | |
from knowledgeassistant.entity.config_entity import KeywordExtractionConfig | |
from knowledgeassistant.utils.main_utils.utils import read_txt_file, write_txt_file | |
import sys | |
import numpy as np | |
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer | |
class KeywordExtraction:
    """Rank the words of a text file by TF-IDF score and persist the top ones.

    The output location is taken from
    ``keyword_extraction_config.extracted_keywords_file_path``.
    """

    def __init__(self, keyword_extraction_config: KeywordExtractionConfig):
        """Keep a reference to the configuration used when writing results.

        Args:
            keyword_extraction_config: Configuration object; only its
                ``extracted_keywords_file_path`` attribute is read by this class.
        """
        self.keyword_extraction_config = keyword_extraction_config

    def extract_keywords(self, input_text_path: str, keywords_count: int):
        """Write the ``keywords_count`` highest-scoring words of a text file.

        The text is tokenised into purely alphabetic words (English stop words
        removed), scored with TF-IDF, sorted by descending score, and the top
        entries are written one per line to the configured output file.

        Args:
            input_text_path: Path of the text file to analyse.
            keywords_count: Maximum number of keywords to keep. Must be a
                non-negative integer; integer-like values such as ``"10"``
                are coerced with ``int()``.

        Raises:
            KnowledgeAssistantException: Wraps any error raised while reading,
                vectorising, ranking or writing, including an invalid
                ``keywords_count``.
        """
        try:
            # A slice bound must be an int; coerce up front so numeric strings
            # work, and reject negatives, which would otherwise silently mean
            # "everything except the last |n| keywords".
            limit = int(keywords_count)
            if limit < 0:
                raise ValueError(
                    f"keywords_count must be non-negative, got {keywords_count!r}"
                )

            vectorizer = CountVectorizer(stop_words='english', token_pattern=r'(?u)\b[a-zA-Z]+\b')
            transformer = TfidfTransformer()
            logging.info("Vectorizer and Transformer successfully setup")

            text = read_txt_file(file_path=input_text_path)
            # The whole file is treated as a single document, so the matrices
            # below have exactly one row.
            # NOTE(review): with one document every term has the same document
            # frequency, so the ranking is effectively by term count -- confirm
            # this is the intended scoring.
            word_count_matrix = vectorizer.fit_transform([text])
            tfidf_matrix = transformer.fit_transform(word_count_matrix)
            logging.info("Successfully calculated word count and their tfidf scores")

            # Use the vocabulary exactly as the vectorizer reports it so that
            # column index i of the score row always refers to vocabulary[i].
            # (The token pattern already restricts tokens to ASCII letters, so
            # no additional alphabetic filtering is required.)
            vocabulary = np.asarray(vectorizer.get_feature_names_out())
            scores = tfidf_matrix.toarray().ravel()
            descending_order = np.argsort(scores)[::-1]
            logging.info("Successfully extracted keywords and sorted in descending order of their tfidf scores")

            top_keywords = vocabulary[descending_order[:limit]]
            content = "\n".join(top_keywords)
            # Third positional argument is passed through unchanged from the
            # original call; presumably an overwrite/replace flag -- verify
            # against write_txt_file.
            write_txt_file(
                self.keyword_extraction_config.extracted_keywords_file_path,
                content,
                True
            )
            logging.info(f"Successfully extracted and wrote top {keywords_count} keywords from text")
        except Exception as e:
            raise KnowledgeAssistantException(e, sys) from e

    def initiate_keyword_extraction(self, input_text_path: str, keywords_count: int):
        """Entry point that runs keyword extraction for one input file.

        Args:
            input_text_path: Path of the text file to analyse.
            keywords_count: Maximum number of keywords to write.

        Raises:
            KnowledgeAssistantException: If extraction fails for any reason.
        """
        try:
            self.extract_keywords(
                input_text_path=input_text_path,
                keywords_count=keywords_count,
            )
        except Exception as e:
            raise KnowledgeAssistantException(e, sys) from e