proKBD committed on
Commit
285f8a6
·
verified ·
1 Parent(s): 821b188

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +58 -0
utils.py CHANGED
@@ -1035,6 +1035,64 @@ class SentimentAnalyzer:
1035
  print(f"Error extracting sentiment targets: {str(e)}")
1036
  return []
1037
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1038
  class TextToSpeechConverter:
1039
  def __init__(self):
1040
  self.output_dir = AUDIO_OUTPUT_DIR
 
1035
  print(f"Error extracting sentiment targets: {str(e)}")
1036
  return []
1037
 
1038
class TextSummarizer:
    """Summarize free text with a Hugging Face ``summarization`` pipeline.

    Long inputs are split into word-aligned chunks, each chunk is summarized
    on its own, and the partial summaries are joined with single spaces.
    """

    # Number of characters of the input returned when summarization fails.
    FALLBACK_LENGTH = 200

    def __init__(self):
        """Load the configured summarization model, else the pipeline default."""
        try:
            # Initialize the summarization pipeline
            self.summarizer = pipeline("summarization", model=SUMMARIZATION_MODEL)
        except Exception as e:
            print(f"Error initializing TextSummarizer: {str(e)}")
            # Fallback to default model if specific model fails
            self.summarizer = pipeline("summarization")

    def summarize(self, text: str) -> str:
        """Generate a concise summary of the text.

        Args:
            text: The text to summarize. Newlines are replaced by spaces and
                surrounding whitespace is stripped before processing.

        Returns:
            The chunk summaries joined by spaces. An empty string for empty
            (or ``None``) input. If summarization raises, the cleaned text
            itself is returned, cut to ``FALLBACK_LENGTH`` characters and
            suffixed with ``'...'`` only when it was actually truncated.
        """
        # Nothing to summarize; also keeps the except-handler below from
        # failing on ``None`` while trying to slice it.
        if not text:
            return ""

        try:
            # Clean and prepare text
            text = text.replace('\n', ' ').strip()

            # One model call per chunk; an empty text yields no chunks and
            # therefore an empty summary.
            summaries = [
                self.summarizer(
                    chunk,
                    max_length=130,
                    min_length=30,
                    do_sample=False,
                )[0]['summary_text']
                for chunk in self._split_text(text)
            ]

            # Combine summaries if there were multiple chunks
            return ' '.join(summaries)

        except Exception as e:
            print(f"Error generating summary: {str(e)}")
            # Best-effort fallback: hand back the beginning of the text, and
            # only mark it with an ellipsis if something was cut off.
            if len(text) <= self.FALLBACK_LENGTH:
                return text
            return text[:self.FALLBACK_LENGTH] + '...'

    def _split_text(self, text: str, max_length: int = 1024) -> List[str]:
        """Split text into word-aligned chunks of bounded size.

        Size is measured in characters (each word counts as its length plus
        one for the separating space), not in model tokens.

        Args:
            text: The text to split on whitespace.
            max_length: Character budget per chunk.

        Returns:
            The chunks in order. Words are never broken, so a single word
            longer than ``max_length`` becomes a chunk of its own. Empty
            chunks are never produced; empty text gives an empty list.
        """
        chunks = []
        current_chunk = []
        current_length = 0

        for word in text.split():
            word_length = len(word) + 1  # +1 for space
            if current_length + word_length > max_length:
                # Flush only a non-empty chunk: when the very first word is
                # already over budget there is nothing to flush yet.
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_length = word_length
            else:
                current_chunk.append(word)
                current_length += word_length

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks
1095
+
1096
  class TextToSpeechConverter:
1097
  def __init__(self):
1098
  self.output_dir = AUDIO_OUTPUT_DIR