# NOTE: non-code residue from a Hugging Face Spaces web page (status banner,
# file size, commit hashes, and gutter line numbers) was commented out here so
# that this file parses as Python.
import streamlit as st
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
MODEL_NAME = "valhalla/distilbart-mnli-12-1"
device = 0 if torch.cuda.is_available() else -1
@st.cache_resource
def load_zero_shot_pipeline():
    """Construct the zero-shot classification pipeline once and cache it.

    `st.cache_resource` keeps the tokenizer/model pair alive across Streamlit
    reruns so the (slow) download/load happens only on the first run.
    """
    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    mdl = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    return pipeline(
        "zero-shot-classification",
        model=mdl,
        tokenizer=tok,
        device=device,  # 0 = first GPU, -1 = CPU (set at module level)
    )
# Build the cached pipeline once per session.
zero_shot = load_zero_shot_pipeline()

st.title("ArticleInsight (Demo Offline Pipeline)")
# BUGFIX: the original used escaped quotes (\"\"\") around this literal,
# which is a syntax error in a plain .py file; a real triple-quoted string
# is required.
st.markdown("""
**Upload a CSV** with an 'Abstract' column. We'll run a simple 8-step analysis:
1. Empirical Study?
2. Identify Construct
3. Sample Details
4. Main Research Question
5. Key Findings
6. Variables (IV/DV)
7. Antecedents, Outcomes
8. Unit of Analysis
**Disclaimer**: This is a *very naive* demonstration using zero-shot classification and simple regex.
It won't be super accurate, but requires no coding from you!
""")
uploaded_file = st.file_uploader("Upload CSV with 'Abstract' column")
# Main UI flow: validate the upload, then run the 8-step per-abstract analysis.
# NOTE(review): the helper functions called below (classify_empirical, etc.)
# are defined *after* this point in the file; because Streamlit executes the
# script top-to-bottom on every run, clicking "Run Analysis" will raise a
# NameError unless those definitions are moved above this block — confirm and
# reorder the file.
if uploaded_file:
    df = pd.read_csv(uploaded_file)
    if "Abstract" not in df.columns:
        st.error("CSV must have an 'Abstract' column.")
        st.stop()  # abort the script run; nothing below executes
    st.success("File uploaded successfully!")
    if st.button("Run Analysis"):
        with st.spinner("Analyzing each abstract..."):
            # Pre-create the output columns so every row has a value.
            for col in ["Empirical Study", "Construct", "Sample Details",
                        "Research Question", "Key Findings", "Variables",
                        "Antecedents", "Outcomes", "Unit of Analysis"]:
                df[col] = ""
            for i, row in df.iterrows():
                abstract = str(row["Abstract"])
                df.at[i, "Empirical Study"] = classify_empirical(abstract)
                if df.at[i, "Empirical Study"] == "Yes":
                    # Only empirical papers get the full extraction pipeline.
                    df.at[i, "Construct"] = find_constructs(abstract)
                    df.at[i, "Sample Details"] = extract_sample_details(abstract)
                    df.at[i, "Research Question"] = guess_research_question(abstract)
                    df.at[i, "Key Findings"] = guess_key_findings(abstract)
                    var, ants, outs = identify_variables(abstract)
                    df.at[i, "Variables"] = var
                    df.at[i, "Antecedents"] = ants
                    df.at[i, "Outcomes"] = outs
                    df.at[i, "Unit of Analysis"] = identify_unit_of_analysis(abstract)
                else:
                    # Non-empirical (or unknown) papers: mark every field N/A.
                    for col in ["Construct", "Sample Details", "Research Question",
                                "Key Findings", "Variables", "Antecedents",
                                "Outcomes", "Unit of Analysis"]:
                        df.at[i, col] = "N/A"
        st.success("Done!")
        st.dataframe(df.head(50))  # preview only the first 50 rows
        csv_data = df.to_csv(index=False).encode("utf-8")
        st.download_button("Download Analyzed CSV", data=csv_data,
                           file_name="analysis_output.csv", mime="text/csv")
# === Functions ===
def classify_empirical(text):
    """Label *text* as an empirical study via zero-shot classification.

    Returns "Yes" when the model confidently (>0.5) picks "empirical study",
    "No" when it confidently picks "theoretical paper", else "Unknown".
    """
    labels = ["empirical study", "theoretical paper"]
    result = zero_shot(text, labels)
    best_label = result["labels"][0]
    best_score = result["scores"][0]
    if best_score > 0.5:
        if best_label == "empirical study":
            return "Yes"
        if best_label == "theoretical paper":
            return "No"
    return "Unknown"
def find_constructs(text):
    """Guess up to two candidate constructs from *text*.

    Heuristic: the two most frequent purely-alphabetic words longer than
    five characters (case-folded). Returns "Unknown" when no word qualifies.
    """
    tally = {}
    for word in text.lower().split():
        if word.isalpha() and len(word) > 5:
            tally[word] = tally.get(word, 0) + 1
    if not tally:
        return "Unknown"
    # Stable sort: ties keep first-seen order, matching the original behavior.
    ranked = sorted(tally, key=tally.get, reverse=True)
    return ", ".join(ranked[:2])
def extract_sample_details(text):
    """Extract crude sample-size/population hints from an abstract.

    Matches patterns like "n = 120", "sample of 45", "120 participants",
    "30 subjects", and appends "students"/"employees" markers when those
    words appear. Returns "; "-joined hits, or "Unknown" when nothing matches.

    BUGFIX: the original built the string with `info += "; students"`, which
    produced a leading "; " (e.g. "; students") whenever the regex found
    nothing but a population keyword was present. Collecting parts in a list
    and joining once avoids the stray separator.
    """
    import re
    t = text.lower()
    pattern = r"(n\s*=\s*\d+|sample of \d+|\d+\s+participants|\d+\s+subjects)"
    # With a single capture group, re.findall returns plain strings.
    parts = re.findall(pattern, t)
    if "student" in t:
        parts.append("students")
    if "employee" in t:
        parts.append("employees")
    return "; ".join(parts) if parts else "Unknown"
def guess_research_question(text):
    """Pull a crude research-question snippet from *text*.

    Looks for the cue phrases "effect of" (wrapped as "Does ...?") and then
    "aim of this study" (returned verbatim); 60 characters are taken from the
    original-case text starting at the cue. Returns "Unknown" if neither cue
    is present.
    """
    lowered = text.lower()
    for cue in ("effect of", "aim of this study"):
        pos = lowered.find(cue)
        if pos != -1:
            snippet = text[pos:pos + 60]
            return f"Does {snippet}?" if cue == "effect of" else snippet
    return "Unknown"
def guess_key_findings(text):
    """Return a 100-character snippet starting at a findings cue phrase.

    Checks "we find that" first, then "results show"; the snippet keeps the
    original casing. Returns "Unknown" when neither phrase occurs.
    """
    lowered = text.lower()
    for cue in ("we find that", "results show"):
        pos = lowered.find(cue)
        if pos != -1:
            return text[pos:pos + 100]
    return "Unknown"
def identify_variables(text):
    """Detect an IV/DV pair from "<impact|influence|effect> of X on Y".

    Returns a 3-tuple: a display string "IV: x, DV: y", the antecedent word,
    and the outcome word — or ("Unknown", "Unknown", "Unknown") on no match.
    """
    import re
    match = re.search(r"(impact|influence|effect) of (\w+) on (\w+)", text.lower())
    if match is None:
        return "Unknown", "Unknown", "Unknown"
    _, iv, dv = match.groups()
    return f"IV: {iv}, DV: {dv}", iv, dv
def identify_unit_of_analysis(text):
    """Classify the unit of analysis by keyword, in priority order.

    Team > Organization > Individual; "Unknown" when no cue word appears.
    """
    lowered = text.lower()
    levels = (
        ("Team", ("team", "groups")),
        ("Organization", ("organization", "firm")),
        ("Individual", ("participant", "individual", "student", "employee")),
    )
    for label, cues in levels:
        if any(cue in lowered for cue in cues):
            return label
    return "Unknown"
# NOTE(review): these environment variables are meant to configure Streamlit's
# server (headless mode, bind address), but they are set at the *bottom* of the
# script — Streamlit reads its server config at startup, before this line runs,
# so they likely have no effect here. Consider setting them in the launch shell
# or in .streamlit/config.toml instead — TODO confirm.
import os
os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
os.environ["STREAMLIT_SERVER_ADDRESS"] = "0.0.0.0"
# (trailing gutter character from the pasted web page removed)