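"""Russian text analysis for a Hugging Face Space.

Uses spaCy's ru_core_news_lg pipeline to report basic statistics,
morphology, syntax, named entities, lexical diversity (TTR and a
simplified MTLD), readability, and sentence-to-sentence semantic
coherence for a Russian text.
"""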
import spacy
from collections import Counter
# Load the large Russian pipeline, downloading it on first run.
try:
    nlp = spacy.load("ru_core_news_lg")
except OSError:
    import spacy.cli
    spacy.cli.download("ru_core_news_lg")
    nlp = spacy.load("ru_core_news_lg")

def analyze_text(text):
    doc = nlp(text)

    # Basic token and word statistics.
    tokens = [token.text for token in doc]
    words = [token.text for token in doc if token.is_alpha]
    unique_words = set(words)
    stop_words = [token.text for token in doc if token.is_stop]
    avg_word_length = sum(len(word) for word in words) / len(words) if words else 0

    # Morphology: coarse POS distribution and lemma inventory.
    pos_counts = Counter(token.pos_ for token in doc if token.is_alpha)
    lemmas = [token.lemma_ for token in doc if token.is_alpha]
    unique_lemmas = set(lemmas)

    # Syntax: dependency-label counts. noun_chunks is not implemented
    # for every language, so probe for support instead of assuming it.
    dependencies = Counter(token.dep_ for token in doc if token.dep_ != "")
    has_noun_chunks = False
    try:
        next(doc.noun_chunks, None)
        has_noun_chunks = True
    except NotImplementedError:
        pass

    # Named entities.
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    entity_counts = Counter(ent.label_ for ent in doc.ents)

    # Lexical diversity: type-token ratio and a simplified MTLD.
    ttr = len(unique_words) / len(words) if words else 0
    def calculate_simplified_mtld(text_tokens, ttr_threshold=0.72):
        # Simplified MTLD: cut the token stream into segments whose
        # running TTR has fallen to the threshold, then report the mean
        # segment length (longer segments = more diverse vocabulary).
        if len(text_tokens) < 10:
            return 0
        segments = []
        current_segment = []
        for token in text_tokens:
            current_segment.append(token)
            current_ttr = len(set(current_segment)) / len(current_segment)
            if current_ttr <= ttr_threshold and len(current_segment) >= 10:
                segments.append(current_segment)
                current_segment = []
        if current_segment:
            segments.append(current_segment)
        if not segments:
            return 0
        return len(text_tokens) / len(segments)
    mtld = calculate_simplified_mtld(words)

    # Sentence-level structure.
    sentences = list(doc.sents)
    sentence_lengths = [len(sent) for sent in sentences]
    avg_sentence_length = sum(sentence_lengths) / len(sentences) if sentences else 0
    words_per_sentence = len(words) / len(sentences) if sentences else 0

    def count_syllables_ru(word):
        # Approximate Russian syllable count as the number of vowels.
        return len([c for c in word.lower() if c in 'аеёиоуыэюя'])

    syllables = sum(count_syllables_ru(word) for word in words)
    syllables_per_word = syllables / len(words) if words else 0
    # Flesch reading-ease formula with coefficients adapted for Russian.
    flesch_kincaid = 206.835 - 1.3 * words_per_sentence - 60.1 * syllables_per_word
    long_words = [word for word in words if count_syllables_ru(word) > 4]
    long_words_percent = len(long_words) / len(words) * 100 if words else 0

    sentence_count = len(sentences)
    question_count = sum(1 for sent in sentences if sent.text.strip().endswith('?'))
    exclamation_count = sum(1 for sent in sentences if sent.text.strip().endswith('!'))
    # Semantic coherence: average vector similarity of adjacent sentences.
    coherence_scores = []
    if len(sentences) > 1:
        for i in range(len(sentences) - 1):
            if len(sentences[i]) > 0 and len(sentences[i + 1]) > 0:
                try:
                    if sentences[i].vector_norm > 0 and sentences[i + 1].vector_norm > 0:
                        sim = sentences[i].similarity(sentences[i + 1])
                        coherence_scores.append(sim)
                except Exception:
                    pass
    avg_coherence = sum(coherence_scores) / len(coherence_scores) if coherence_scores else 0
    analysis_results = {
        'basic_stats': {
            'total_tokens': len(tokens),
            'total_words': len(words),
            'unique_words': len(unique_words),
            'stop_words': len(stop_words),
            'avg_word_length': avg_word_length
        },
        'morphological_analysis': {
            'pos_distribution': dict(pos_counts),
            'unique_lemmas': len(unique_lemmas),
            'lemma_word_ratio': len(unique_lemmas) / len(unique_words) if unique_words else 0
        },
        'syntactic_analysis': {
            'dependencies': dict(dependencies.most_common(10)),
            'noun_chunks': has_noun_chunks
        },
        'named_entities': {
            'total_entities': len(entities),
            'entity_types': dict(entity_counts)
        },
        'lexical_diversity': {
            'ttr': ttr,
            'mtld': mtld
        },
        'text_structure': {
            'sentence_count': sentence_count,
            'avg_sentence_length': avg_sentence_length,
            'question_sentences': question_count,
            'exclamation_sentences': exclamation_count
        },
        'readability': {
            'words_per_sentence': words_per_sentence,
            'syllables_per_word': syllables_per_word,
            'flesch_kincaid_score': flesch_kincaid,
            'long_words_percent': long_words_percent
        },
        'semantic_coherence': {
            'avg_coherence_score': avg_coherence
        }
    }
    return analysis_results

def show_text_analysis(analysis):
    # Print a human-readable report for the dict built by analyze_text.
    print("\n📊 TEXT ANALYSIS")

    print("\n=== BASIC STATISTICS ===")
    print(f"- Total tokens: {analysis['basic_stats']['total_tokens']}")
    print(f"- Total words: {analysis['basic_stats']['total_words']}")
    print(f"- Unique words: {analysis['basic_stats']['unique_words']}")
    print(f"- Stop words: {analysis['basic_stats']['stop_words']}")
    print(f"- Average word length: {analysis['basic_stats']['avg_word_length']:.2f} characters")

    print("\n=== MORPHOLOGICAL ANALYSIS ===")
    print("- POS distribution:")
    for pos, count in analysis['morphological_analysis']['pos_distribution'].items():
        print(f"  • {pos}: {count}")
    print(f"- Unique lemmas: {analysis['morphological_analysis']['unique_lemmas']}")

    print("\n=== SYNTACTIC ANALYSIS ===")
    print("- Syntactic dependencies (top 5):")
    for i, (dep, count) in enumerate(analysis['syntactic_analysis']['dependencies'].items()):
        if i >= 5:
            break
        print(f"  • {dep}: {count}")

    print("\n=== NAMED ENTITIES ===")
    print(f"- Total entities: {analysis['named_entities']['total_entities']}")

    print("\n=== LEXICAL DIVERSITY ===")
    print(f"- TTR (type-token ratio): {analysis['lexical_diversity']['ttr']:.3f}")
    print(f"- MTLD (simplified): {analysis['lexical_diversity']['mtld']:.2f}")

    print("\n=== TEXT STRUCTURE ===")
    print(f"- Sentence count: {analysis['text_structure']['sentence_count']}")
    print(f"- Average sentence length: {analysis['text_structure']['avg_sentence_length']:.2f} tokens")

    print("\n=== READABILITY ===")
    print(f"- Flesch-Kincaid score: {analysis['readability']['flesch_kincaid_score']:.2f}")
    print(f"- Long words percentage: {analysis['readability']['long_words_percent']:.2f}%")

    print("\n=== SEMANTIC COHERENCE ===")
    print(f"- Average coherence between sentences: {analysis['semantic_coherence']['avg_coherence_score']:.3f}")