# NOTE: page-extraction residue, not part of the module -- "Spaces: Running", file size 2,430 bytes, id 5c9215b; the viewer's line-number gutter (1-51) was dropped.
from knowledgeassistant.exception.exception import KnowledgeAssistantException
from knowledgeassistant.logging.logger import logging
from knowledgeassistant.entity.config_entity import KeywordExtractionConfig
from knowledgeassistant.utils.main_utils.utils import read_txt_file, write_txt_file
import sys
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
class KeywordExtraction:
    """Rank the words of a text file by TF-IDF score and save the top ones.

    The output location is read from
    ``keyword_extraction_config.extracted_keywords_file_path``.
    """

    def __init__(self, keyword_extraction_config: KeywordExtractionConfig):
        """Keep a reference to the configuration used when writing results.

        Args:
            keyword_extraction_config: Configuration object; only its
                ``extracted_keywords_file_path`` attribute is read by this
                class.

        Raises:
            KnowledgeAssistantException: If storing the configuration fails.
        """
        try:
            self.keyword_extraction_config = keyword_extraction_config
        except Exception as e:
            raise KnowledgeAssistantException(e, sys) from e

    def extract_keywords(self, input_text_path: str, keywords_count: int):
        """Extract the top ``keywords_count`` keywords and write them to disk.

        The text is tokenised into purely alphabetic words with English stop
        words removed, scored with TF-IDF, and the highest-scoring words are
        written one per line to the configured output file.

        Note: the vectorizer is fitted on a single document, so every word has
        the same IDF weight and the ranking is effectively by (normalised)
        term frequency.

        Args:
            input_text_path: Path of the text file to analyse.
            keywords_count: Maximum number of keywords to keep. Anything
                ``int()`` accepts (e.g. ``5`` or ``"5"``) is allowed; it must
                not be negative.

        Raises:
            KnowledgeAssistantException: On any failure (unreadable input,
                empty vocabulary, invalid ``keywords_count``, write error).
        """
        try:
            # A slice bound must be an integer; coercing here lets numeric
            # strings (e.g. a value taken from a form field) work too.
            top_n = int(keywords_count)
            if top_n < 0:
                # A negative slice bound would silently drop the *lowest*
                # ranked words instead of limiting the result size.
                raise ValueError(
                    f"keywords_count must be non-negative, got {keywords_count!r}"
                )

            vectorizer = CountVectorizer(
                stop_words='english',
                token_pattern=r'(?u)\b[a-zA-Z]+\b',
            )
            transformer = TfidfTransformer()
            logging.info("Vectorizer and Transformer successfully setup")

            text = read_txt_file(file_path=input_text_path)
            word_count_matrix = vectorizer.fit_transform([text])
            tfidf_matrix = transformer.fit_transform(word_count_matrix)
            logging.info("Successfully calculated word count and their tfidf scores")

            # Use the full vocabulary, unfiltered: the indices produced by
            # argsort refer to columns of the TF-IDF matrix, so the name array
            # must stay aligned with those columns. The token pattern already
            # guarantees every feature is alphabetic.
            feature_array = np.asarray(vectorizer.get_feature_names_out())
            # Ascending argsort reversed -> column indices by descending score.
            tfidf_sorting = np.argsort(tfidf_matrix.toarray()).flatten()[::-1]
            logging.info("Successfully extracted keywords and sorted in descending order of their tfidf scores")

            top_keywords = feature_array[tfidf_sorting][:top_n]
            content = "\n".join(top_keywords)
            # NOTE(review): the third positional argument's meaning is defined
            # by write_txt_file (not visible here) -- confirm against the helper.
            write_txt_file(
                self.keyword_extraction_config.extracted_keywords_file_path,
                content,
                True
            )
            logging.info(f"Successfully extracted and wrote top {keywords_count} keywords from text")
        except KnowledgeAssistantException:
            # Already a project exception (e.g. raised by a helper); re-raise
            # unchanged rather than wrapping it a second time.
            raise
        except Exception as e:
            raise KnowledgeAssistantException(e, sys) from e

    def initiate_keyword_extraction(self, input_text_path: str, keywords_count: int):
        """Run keyword extraction; entry point that delegates to ``extract_keywords``.

        Args:
            input_text_path: Path of the text file to analyse.
            keywords_count: Maximum number of keywords to keep.

        Raises:
            KnowledgeAssistantException: If extraction fails.
        """
        try:
            self.extract_keywords(
                input_text_path=input_text_path,
                keywords_count=keywords_count,
            )
        except KnowledgeAssistantException:
            # extract_keywords already wrapped the failure; avoid nesting it.
            raise
        except Exception as e:
            raise KnowledgeAssistantException(e, sys) from e