Spaces:
Running
on
Zero
Running
on
Zero
import pandas as pd | |
from text_analysis import analyze_text | |
def extract_features(text, feature_config=None, scores=None):
    """Build a one-row feature table from the linguistic analysis of *text*.

    The text is passed to ``analyze_text`` and selected values from the
    returned mapping are copied into columns of a single-row DataFrame
    (index ``0``).  Any value missing from the analysis is recorded as 0.

    Args:
        text: The text to analyse; forwarded unchanged to ``analyze_text``.
        feature_config: Mapping that selects which feature groups to emit.
            List-valued groups name the individual features to copy; the
            ``'semantic'`` entry is a plain on/off flag.  When ``None`` a
            default configuration enabling every group is used.
            Note: the ``'basic_scores'`` key exists in the default config
            but is never consulted here -- the two score columns are always
            written.
        scores: Optional mapping with ``'score_chat'`` / ``'score_coder'``.
            When missing or empty both columns are set to 0 and a warning
            is printed.

    Returns:
        A tuple ``(features_df, text_analysis)``: the one-row DataFrame and
        the raw object returned by ``analyze_text``.
    """
    if feature_config is None:
        feature_config = {
            'basic_scores': True,
            'basic_text_stats': ['total_tokens', 'total_words', 'unique_words', 'stop_words', 'avg_word_length'],
            'morphological': ['pos_distribution', 'unique_lemmas', 'lemma_word_ratio'],
            'syntactic': ['dependencies', 'noun_chunks'],
            'entities': ['total_entities', 'entity_types'],
            'diversity': ['ttr', 'mtld'],
            'structure': ['sentence_count', 'avg_sentence_length', 'question_sentences', 'exclamation_sentences'],
            'readability': ['words_per_sentence', 'syllables_per_word', 'flesh_kincaid_score', 'long_words_percent'],
            'semantic': True
        }

    # assumes analyze_text returns a dict of dicts keyed by section name --
    # TODO confirm against text_analysis.analyze_text
    analysis = analyze_text(text)
    row = pd.DataFrame(index=[0])

    # Score columns always come first, whatever the configuration says.
    if not scores:
        chat_score = coder_score = 0
        print("Warning: No scores provided, using zeros for score_chat and score_coder")
    else:
        chat_score = scores.get('score_chat', 0)
        coder_score = scores.get('score_coder', 0)
    row['score_chat'] = chat_score
    row['score_coder'] = coder_score

    # One entry per list-valued feature group, in output column order:
    #   (config key, analysis section, column prefix, expansion)
    # ``expansion`` is None, or (feature name, column prefix, tags) for the
    # single feature in that group that fans out into one column per tag.
    group_table = (
        ('basic_text_stats', 'basic_stats', 'basic', None),
        ('morphological', 'morphological_analysis', 'morph',
         ('pos_distribution', 'pos',
          ['NOUN', 'VERB', 'ADJ', 'ADV', 'PROPN', 'DET', 'ADP', 'PRON', 'CCONJ', 'SCONJ'])),
        ('syntactic', 'syntactic_analysis', 'synt',
         ('dependencies', 'dep',
          ['nsubj', 'obj', 'amod', 'nmod', 'ROOT', 'punct', 'case'])),
        ('entities', 'named_entities', 'ent',
         ('entity_types', 'ent', ['PER', 'LOC', 'ORG'])),
        ('diversity', 'lexical_diversity', 'div', None),
        ('structure', 'text_structure', 'struct', None),
        ('readability', 'readability', 'read', None),
    )

    for config_key, section_key, prefix, expansion in group_table:
        if not feature_config.get(config_key):
            continue
        for name in feature_config[config_key]:
            section = analysis.get(section_key, {})
            if expansion is not None and name == expansion[0]:
                expand_key, tag_prefix, tags = expansion
                tag_counts = section.get(expand_key, {})
                for tag in tags:
                    row[f'{tag_prefix}_{tag}'] = tag_counts.get(tag, 0)
            else:
                row[f'{prefix}_{name}'] = section.get(name, 0)

    # 'semantic' is a flag rather than a feature list: it yields one column.
    if feature_config.get('semantic'):
        coherence = analysis.get('semantic_coherence', {})
        row['semantic_coherence'] = coherence.get('avg_coherence_score', 0)

    return row, analysis