Create app.py
app.py
ADDED
@@ -0,0 +1,155 @@
import re

import streamlit as st
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

MODEL_NAME = "valhalla/distilbart-mnli-12-1"
device = 0 if torch.cuda.is_available() else -1


@st.cache_resource
def load_zero_shot_pipeline():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    return pipeline("zero-shot-classification", model=model, tokenizer=tokenizer, device=device)


zero_shot = load_zero_shot_pipeline()


# === Functions ===
# Defined before the Streamlit flow below so they already exist when the
# "Run Analysis" branch calls them.

def classify_empirical(text):
    """Zero-shot check: empirical study vs. theoretical paper."""
    candidate_labels = ["empirical study", "theoretical paper"]
    res = zero_shot(text, candidate_labels)
    top_label = res["labels"][0]
    top_score = res["scores"][0]
    if top_label == "empirical study" and top_score > 0.5:
        return "Yes"
    elif top_label == "theoretical paper" and top_score > 0.5:
        return "No"
    return "Unknown"


def find_constructs(text):
    """Rough proxy: the two most frequent words longer than five letters."""
    tokens = text.lower().split()
    freq = {}
    for w in tokens:
        if len(w) > 5 and w.isalpha():
            freq[w] = freq.get(w, 0) + 1
    sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    if not sorted_freq:
        return "Unknown"
    return ", ".join([x[0] for x in sorted_freq[:2]])


def extract_sample_details(text):
    """Regex for sample-size phrases plus simple keyword checks for sample type."""
    t = text.lower()
    pattern = r"(n\s*=\s*\d+|sample of \d+|\d+\s+participants|\d+\s+subjects)"
    matches = re.findall(pattern, t)
    info = "; ".join([m[0] if isinstance(m, tuple) else m for m in matches]) if matches else ""
    if "student" in t:
        info += "; students"
    if "employee" in t:
        info += "; employees"
    return info if info else "Unknown"


def guess_research_question(text):
    """Grab a short snippet around common research-question phrases."""
    lower = text.lower()
    if "effect of" in lower:
        idx = lower.index("effect of")
        snippet = text[idx: idx + 60]
        return f"Does {snippet}?"
    elif "aim of this study" in lower:
        idx = lower.index("aim of this study")
        snippet = text[idx: idx + 60]
        return snippet
    return "Unknown"


def guess_key_findings(text):
    """Grab a short snippet after common findings phrases."""
    lower = text.lower()
    if "we find that" in lower:
        idx = lower.index("we find that")
        return text[idx: idx + 100]
    elif "results show" in lower:
        idx = lower.index("results show")
        return text[idx: idx + 100]
    return "Unknown"


def identify_variables(text):
    """Look for an '(impact|influence|effect) of X on Y' pattern to extract IV and DV."""
    t = text.lower()
    pattern = r"(impact|influence|effect) of (\w+) on (\w+)"
    match = re.search(pattern, t)
    if match:
        iv = match.group(2)
        dv = match.group(3)
        return f"IV: {iv}, DV: {dv}", iv, dv
    return "Unknown", "Unknown", "Unknown"


def identify_unit_of_analysis(text):
    """Keyword-based guess at the level of analysis."""
    lower = text.lower()
    if "team" in lower or "groups" in lower:
        return "Team"
    if "organization" in lower or "firm" in lower:
        return "Organization"
    if any(x in lower for x in ["participant", "individual", "student", "employee"]):
        return "Individual"
    return "Unknown"


# === Streamlit app ===

st.title("ArticleInsight (Demo Offline Pipeline)")

st.markdown("""
**Upload a CSV** with an 'Abstract' column. We'll run a simple 8-step analysis:
1. Empirical Study?
2. Identify Construct
3. Sample Details
4. Main Research Question
5. Key Findings
6. Variables (IV/DV)
7. Antecedents, Outcomes
8. Unit of Analysis

**Disclaimer**: This is a *very naive* demonstration using zero-shot classification and simple regex.
It won't be super accurate, but it requires no coding from you!
""")

uploaded_file = st.file_uploader("Upload CSV with 'Abstract' column")

if uploaded_file:
    df = pd.read_csv(uploaded_file)
    if "Abstract" not in df.columns:
        st.error("CSV must have an 'Abstract' column.")
        st.stop()

    st.success("File uploaded successfully!")
    if st.button("Run Analysis"):
        with st.spinner("Analyzing each abstract..."):
            df["Empirical Study"] = ""
            df["Construct"] = ""
            df["Sample Details"] = ""
            df["Research Question"] = ""
            df["Key Findings"] = ""
            df["Variables"] = ""
            df["Antecedents"] = ""
            df["Outcomes"] = ""
            df["Unit of Analysis"] = ""

            for i, row in df.iterrows():
                abstract = str(row["Abstract"])
                df.at[i, "Empirical Study"] = classify_empirical(abstract)
                if df.at[i, "Empirical Study"] == "Yes":
                    df.at[i, "Construct"] = find_constructs(abstract)
                    df.at[i, "Sample Details"] = extract_sample_details(abstract)
                    df.at[i, "Research Question"] = guess_research_question(abstract)
                    df.at[i, "Key Findings"] = guess_key_findings(abstract)
                    var, ants, outs = identify_variables(abstract)
                    df.at[i, "Variables"] = var
                    df.at[i, "Antecedents"] = ants
                    df.at[i, "Outcomes"] = outs
                    df.at[i, "Unit of Analysis"] = identify_unit_of_analysis(abstract)
                else:
                    for col in ["Construct", "Sample Details", "Research Question", "Key Findings",
                                "Variables", "Antecedents", "Outcomes", "Unit of Analysis"]:
                        df.at[i, col] = "N/A"

        st.success("Done!")
        st.dataframe(df.head(50))
        csv_data = df.to_csv(index=False).encode("utf-8")
        st.download_button("Download Analyzed CSV", data=csv_data, file_name="analysis_output.csv", mime="text/csv")
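For reference, classify_empirical relies on the output shape of the Hugging Face zero-shot-classification pipeline: a dict with "sequence", "labels", and "scores", where labels come back sorted by descending score. Below is a minimal standalone sketch of that call, separate from app.py; the file name and example_abstract string are illustrative assumptions, and it presumes the model download succeeds.

# smoke_test.py -- hypothetical standalone check, not part of app.py
from transformers import pipeline

zero_shot = pipeline("zero-shot-classification", model="valhalla/distilbart-mnli-12-1")

# Made-up abstract, used here only to exercise the pipeline.
example_abstract = "We surveyed 250 employees to examine the effect of autonomy on engagement."

res = zero_shot(example_abstract, ["empirical study", "theoretical paper"])

# The pipeline returns {'sequence': ..., 'labels': [...], 'scores': [...]},
# with labels sorted by descending score; classify_empirical() reads
# res["labels"][0] and res["scores"][0] from exactly this structure.
print(res["labels"][0], round(res["scores"][0], 3))

Locally, the app itself can be started with `streamlit run app.py` once streamlit, pandas, torch, and transformers are installed.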