import os

# Ensure the Hugging Face cache directory is writable. This must be set before
# transformers / sentence_transformers are imported so the custom cache
# location actually takes effect.
os.environ["TRANSFORMERS_CACHE"] = "/home/user/.cache/huggingface"

import json

import gradio as gr
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer, util
from sklearn.preprocessing import LabelEncoder, StandardScaler
from symspellpy import SymSpell, Verbosity
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Set device (the Space runs on CPU)
device = torch.device("cpu")
# Define the DiseaseClassifier model
class DiseaseClassifier(nn.Module):
    """Feed-forward network mapping a scaled symptom vector to disease logits."""

    def __init__(self, input_size, num_classes, dropout_rate=0.35665610394511454):
        super(DiseaseClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 382)
        self.fc2 = nn.Linear(382, 389)
        self.fc3 = nn.Linear(389, 433)
        self.fc4 = nn.Linear(433, num_classes)
        self.activation = nn.LeakyReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        x = self.activation(self.fc1(x))
        x = self.dropout(x)
        x = self.activation(self.fc2(x))
        x = self.dropout(x)
        x = self.activation(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)  # Raw logits; no softmax applied here
        return x
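
# Illustrative shape check (a sketch only, left as comments; the feature and
# class counts below are hypothetical, the real ones are derived from data.csv):
#   clf = DiseaseClassifier(input_size=377, num_classes=41)
#   logits = clf(torch.zeros(1, 377))  # -> tensor of shape (1, 41), raw logits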
# Define the DiseasePredictionModel wrapper
class DiseasePredictionModel:
    def __init__(self, ai_model_name="model.pth", data_file="data.csv",
                 symptom_json="symptoms.json", dictionary_file="frequency_dictionary_en_82_765.txt"):
        # Load the dataset (first column: disease label, remaining columns: symptom features)
        self.df = pd.read_csv(data_file)
        self.symptom_columns = self.load_symptoms(symptom_json)
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(self.df.iloc[:, 0])
        self.scaler = StandardScaler()
        self.scaler.fit(self.df.iloc[:, 1:].values)
        self.input_size = len(self.symptom_columns)
        self.num_classes = len(self.label_encoder.classes_)
        self.model = self._load_model(ai_model_name)
        self.SYMPTOM_LIST = self.symptom_columns  # Same list; avoid re-reading the JSON file

        # Load the SymSpell spelling-correction dictionary
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        self.sym_spell.load_dictionary(dictionary_file, term_index=0, count_index=1)

        # Load the BioBERT tokenizer and model for named-entity recognition
        self.tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
        self.nlp_model = AutoModelForTokenClassification.from_pretrained("dmis-lab/biobert-v1.1")
        self.ner_pipeline = pipeline("ner", model=self.nlp_model, tokenizer=self.tokenizer,
                                     aggregation_strategy="simple")

        # Load the Sentence Transformer used for semantic symptom matching
        self.semantic_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    def _load_model(self, ai_model_name):
        model = DiseaseClassifier(self.input_size, self.num_classes).to(device)
        model.load_state_dict(torch.load(ai_model_name, map_location=device))
        model.eval()
        return model
    def predict_disease(self, symptoms):
        input_vector = np.zeros(len(self.symptom_columns))
        for symptom in symptoms:
            if symptom in self.symptom_columns:
                input_vector[list(self.symptom_columns).index(symptom)] = 1
        input_vector = self.scaler.transform([input_vector])
        input_tensor = torch.tensor(input_vector, dtype=torch.float32).to(device)
        with torch.no_grad():
            outputs = self.model(input_tensor)
            _, predicted_class = torch.max(outputs, 1)
        predicted_disease = self.label_encoder.inverse_transform([predicted_class.cpu().numpy()[0]])[0]
        return predicted_disease

    def load_symptoms(self, json_file):
        with open(json_file, "r", encoding="utf-8") as f:
            return json.load(f)
    def correct_text(self, text):
        # Spell-correct each word, leaving words that are already known symptoms untouched
        known_symptoms = {symptom.lower() for symptom in self.SYMPTOM_LIST}
        corrected_words = []
        for word in text.split():
            if word.lower() in known_symptoms:
                corrected_words.append(word)
            else:
                suggestions = self.sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
                corrected_words.append(suggestions[0].term if suggestions else word)
        return " ".join(corrected_words)
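
    # Illustrative sketch (comments only, not executed; actual corrections
    # depend on the SymSpell frequency dictionary): a token like "feaver" would
    # typically be corrected to "fever", while words already in SYMPTOM_LIST
    # pass through unchanged.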
    def extract_symptoms(self, text):
        # Keep the tokens that the NER pipeline tags as DISEASE entities
        ner_results = self.ner_pipeline(text)
        symptoms = set()
        for entity in ner_results:
            if entity["entity_group"] == "DISEASE":
                symptoms.add(entity["word"].lower())
        return list(symptoms)

    def match_symptoms(self, extracted_symptoms):
        # Map each extracted term to its closest canonical symptom by cosine similarity
        matched = {}
        symptom_embeddings = self.semantic_model.encode(self.SYMPTOM_LIST, convert_to_tensor=True)
        for symptom in extracted_symptoms:
            symptom_embedding = self.semantic_model.encode(symptom, convert_to_tensor=True)
            similarities = util.pytorch_cos_sim(symptom_embedding, symptom_embeddings)[0]
            most_similar_idx = similarities.argmax().item()
            matched[symptom] = self.SYMPTOM_LIST[most_similar_idx]
        return list(matched.values())
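
# Illustrative sketch of the matching step (comments only; the symptom list is
# hypothetical): with SYMPTOM_LIST = ["headache", "nausea", "fatigue"],
# match_symptoms(["head ache", "tired"]) would return something like
# ["headache", "fatigue"], since each extracted term is mapped to the canonical
# symptom whose MiniLM embedding has the highest cosine similarity to it.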
# Initialize the model wrapper (loads all weights and resources once at startup)
model = DiseasePredictionModel()

# Prediction function: spell-correct the input, extract symptom mentions, map
# them onto the canonical symptom list, then classify
def predict(symptoms):
    corrected = model.correct_text(symptoms)
    extracted = model.extract_symptoms(corrected)
    matched = model.match_symptoms(extracted)
    return model.predict_disease(matched)
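
# Quick local sanity check (kept commented out so the Space only serves the
# Gradio app; the example sentence is hypothetical):
# print(predict("I have a head ache and feel very tired"))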
# Define Gradio Interface
iface = gr.Interface(fn=predict, inputs="text", outputs="text", title="Disease Prediction AI")
iface.launch()