import streamlit as st import numpy as np import nltk from nltk import pos_tag, word_tokenize from sklearn_crfsuite import CRF from sklearn.model_selection import train_test_split from sklearn.datasets import fetch_20newsgroups from collections import Counter import random # Download necessary NLTK data nltk.download('punkt') nltk.download('averaged_perceptron_tagger') # Page title with emoji st.title('πŸ“– Statistical NLP πŸ“–') # Language Models: n-grams and smoothing techniques st.markdown('

1️⃣ Language Models

', unsafe_allow_html=True) st.markdown('### πŸ“– Definition:') st.write(""" Language models are statistical models that assign probabilities to sequences of words. They help us predict the next word in a sentence or assess the likelihood of a sentence. - **n-grams**: A language model based on n-grams uses sequences of 'n' consecutive words to predict the next word. - **Smoothing Techniques**: Smoothing techniques like Kneser-Ney are used to handle cases where a particular n-gram has not been seen in the training data, thus preventing zero probabilities. """) # Interactive example for n-grams st.markdown('### 🎯 n-gram Model Example:') ngram_input = st.text_area("✍️ Enter a sentence to see n-grams", "I love programming in Python") n = st.slider("πŸ”’ Choose n for n-grams:", 1, 4, 2) if st.button('✨ Generate n-grams'): tokens = word_tokenize(ngram_input.lower()) ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)] st.success(f"{n}-grams in the input sentence: {ngrams}") # Perplexity st.markdown('

2️⃣ Perplexity

', unsafe_allow_html=True) st.markdown('### πŸ“– Definition:') st.write(""" Perplexity is a measurement of how well a probability model predicts a sample. It’s often used to evaluate language models. A lower perplexity means the model is better at predicting the next word. It’s calculated as the inverse probability of the test set normalized by the number of words. """) # Interactive example for calculating Perplexity st.markdown('### πŸ“Š Perplexity Calculation Example:') sample_sentence = st.text_area("✍️ Enter sentence to calculate perplexity", "This is an example sentence") if st.button('πŸ“‰ Calculate Perplexity'): # Simple approximation of perplexity for demonstration word_count = len(sample_sentence.split()) word_freq = Counter(sample_sentence.split()) vocab_size = len(word_freq) perplexity = np.exp(sum(-np.log(word_freq[word] / word_count) for word in sample_sentence.split()) / word_count) st.success(f"Perplexity of the sentence: {perplexity:.2f}") # Hidden Markov Models (HMM): Part-of-Speech (POS) tagging st.markdown('

3️⃣ Hidden Markov Models (HMM)

', unsafe_allow_html=True) st.markdown('### πŸ“– Definition:') st.write(""" Hidden Markov Models (HMMs) are statistical models that represent sequences of observations with underlying hidden states. In NLP, they are commonly used for tasks like Part-of-Speech (POS) tagging, where each word is assigned a grammatical category (e.g., noun, verb). """) # Interactive POS Tagging example using HMM st.markdown('### πŸ” POS Tagging with HMM:') sentence_to_tag = st.text_area("✍️ Enter sentence for POS tagging", "I love programming in Python") if st.button('πŸ“ Tag POS'): tokens = word_tokenize(sentence_to_tag) tagged = pos_tag(tokens) st.success(f"POS Tagging result: {tagged}") # Conditional Random Fields (CRF): Sequence labeling tasks like NER st.markdown('

4️⃣ Conditional Random Fields (CRF)

', unsafe_allow_html=True) st.markdown('### πŸ“– Definition:') st.write(""" Conditional Random Fields (CRF) are used for sequence labeling tasks, where each element in the sequence is assigned a label. They are particularly useful for tasks like Named Entity Recognition (NER), where we want to identify entities such as people, organizations, or locations in text. """) # Sample data for Named Entity Recognition (NER) using CRF st.markdown('### 🏷️ NER Example using CRF:') sample_sentence = st.text_area("✍️ Enter sentence for NER (Named Entity Recognition)", "Barack Obama was born in Hawaii.") # Sample NER prediction using a simple CRF model (a toy example) ner_examples = [ (["Barack", "Obama", "was", "born", "in", "Hawaii"], ["B-PER", "I-PER", "O", "O", "O", "B-LOC"]), (["Apple", "is", "based", "in", "California"], ["B-ORG", "O", "O", "O", "B-LOC"]) ] # Training a simple CRF model with toy data X_train = [x[0] for x in ner_examples] y_train = [x[1] for x in ner_examples] # Initialize CRF crf = CRF(algorithm='lbfgs') crf.fit(X_train, y_train) # Simple prediction for demonstration if st.button('πŸ”Ž Perform NER'): tokens = word_tokenize(sample_sentence) predicted_tags = crf.predict([tokens])[0] entities = [(tokens[i], predicted_tags[i]) for i in range(len(tokens))] st.success(f"NER result: {entities}")