shashwatashish committed on
Commit
38ba8f6
·
verified ·
1 Parent(s): 1f94b6a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +155 -0
app.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Zero-shot NLI model used below to label abstracts as empirical vs. theoretical.
MODEL_NAME = "valhalla/distilbart-mnli-12-1"
# transformers pipeline device convention: 0 = first CUDA GPU, -1 = CPU.
device = 0 if torch.cuda.is_available() else -1
8
+
9
@st.cache_resource
def load_zero_shot_pipeline():
    """Build the zero-shot classification pipeline.

    Decorated with ``st.cache_resource`` so the model weights are loaded
    once per server process instead of on every Streamlit rerun.
    """
    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    mdl = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    return pipeline(
        "zero-shot-classification",
        model=mdl,
        tokenizer=tok,
        device=device,
    )

zero_shot = load_zero_shot_pipeline()
16
+
17
st.title("ArticleInsight (Demo Offline Pipeline)")

# Fix: the original wrote st.markdown(\"\"\" ... \"\"\") — backslash-escaped
# triple quotes outside a string literal are a SyntaxError. Plain triple
# quotes are the correct form; the markdown content is unchanged.
st.markdown("""
**Upload a CSV** with an 'Abstract' column. We'll run a simple 8-step analysis:
1. Empirical Study?
2. Identify Construct
3. Sample Details
4. Main Research Question
5. Key Findings
6. Variables (IV/DV)
7. Antecedents, Outcomes
8. Unit of Analysis

**Disclaimer**: This is a *very naive* demonstration using zero-shot classification and simple regex.
It won't be super accurate, but requires no coding from you!
""")

uploaded_file = st.file_uploader("Upload CSV with 'Abstract' column")
35
+
36
import re

# --- Analysis helpers -------------------------------------------------------
# Fix: these helpers MUST be defined before the Streamlit script body below.
# The original file defined them *after* the `if uploaded_file:` block; a
# Streamlit script executes top to bottom on every rerun, so the first
# "Run Analysis" click raised NameError on `classify_empirical`.


def classify_empirical(text):
    """Classify *text* as an empirical study ("Yes"), theoretical ("No"), or "Unknown".

    Uses the module-level `zero_shot` pipeline; scores at or below 0.5 are
    treated as inconclusive.
    """
    candidate_labels = ["empirical study", "theoretical paper"]
    res = zero_shot(text, candidate_labels)
    top_label = res["labels"][0]
    top_score = res["scores"][0]
    # Only two candidate labels, so a confident top label decides the answer.
    if top_score > 0.5:
        return "Yes" if top_label == "empirical study" else "No"
    return "Unknown"


def find_constructs(text):
    """Return the two most frequent alphabetic words longer than 5 chars.

    Very naive "construct" detection by word frequency; returns "Unknown"
    when no qualifying word exists.
    """
    freq = {}
    for word in text.lower().split():
        if len(word) > 5 and word.isalpha():
            freq[word] = freq.get(word, 0) + 1
    if not freq:
        return "Unknown"
    # Stable sort keeps first-seen order among equal counts.
    ranked = sorted(freq.items(), key=lambda kv: kv[1], reverse=True)
    return ", ".join(word for word, _ in ranked[:2])


# Compiled once; matches "n = 120", "sample of 50", "200 participants", etc.
_SAMPLE_PATTERN = re.compile(r"(n\s*=\s*\d+|sample of \d+|\d+\s+participants|\d+\s+subjects)")


def extract_sample_details(text):
    """Extract naive sample-size / population hints from *text*.

    Fix: the original prepended "; " unconditionally, yielding strings like
    "; students" when the regex found nothing. Parts are now collected in a
    list and joined, so separators only appear between entries.
    """
    t = text.lower()
    parts = _SAMPLE_PATTERN.findall(t)
    if "student" in t:
        parts.append("students")
    if "employee" in t:
        parts.append("employees")
    return "; ".join(parts) if parts else "Unknown"


def guess_research_question(text):
    """Return a 60-char snippet around a research-question phrase, or "Unknown"."""
    lower = text.lower()
    if "effect of" in lower:
        idx = lower.index("effect of")
        return f"Does {text[idx: idx + 60]}?"
    if "aim of this study" in lower:
        idx = lower.index("aim of this study")
        return text[idx: idx + 60]
    return "Unknown"


def guess_key_findings(text):
    """Return a 100-char snippet starting at a findings phrase, or "Unknown"."""
    lower = text.lower()
    for marker in ("we find that", "results show"):
        if marker in lower:
            idx = lower.index(marker)
            return text[idx: idx + 100]
    return "Unknown"


# Compiled once; captures "<impact|influence|effect> of <IV> on <DV>".
_VARIABLE_PATTERN = re.compile(r"(impact|influence|effect) of (\w+) on (\w+)")


def identify_variables(text):
    """Return (summary, IV, DV) parsed from an "effect of X on Y" phrase.

    All three values are "Unknown" when no such phrase is found.
    """
    match = _VARIABLE_PATTERN.search(text.lower())
    if match:
        iv = match.group(2)
        dv = match.group(3)
        return f"IV: {iv}, DV: {dv}", iv, dv
    return "Unknown", "Unknown", "Unknown"


def identify_unit_of_analysis(text):
    """Keyword-based guess of the study's unit of analysis."""
    lower = text.lower()
    if "team" in lower or "groups" in lower:
        return "Team"
    if "organization" in lower or "firm" in lower:
        return "Organization"
    if any(x in lower for x in ["participant", "individual", "student", "employee"]):
        return "Individual"
    return "Unknown"


# --- Streamlit script body ---------------------------------------------------

# One definition of the output schema; [0] is the gate column, [1:] get "N/A"
# for non-empirical papers.
_RESULT_COLUMNS = [
    "Empirical Study", "Construct", "Sample Details", "Research Question",
    "Key Findings", "Variables", "Antecedents", "Outcomes", "Unit of Analysis",
]

if uploaded_file:
    df = pd.read_csv(uploaded_file)
    if "Abstract" not in df.columns:
        st.error("CSV must have an 'Abstract' column.")
        st.stop()

    st.success("File uploaded successfully!")
    if st.button("Run Analysis"):
        with st.spinner("Analyzing each abstract..."):
            for col in _RESULT_COLUMNS:
                df[col] = ""

            for i, row in df.iterrows():
                abstract = str(row["Abstract"])
                df.at[i, "Empirical Study"] = classify_empirical(abstract)
                if df.at[i, "Empirical Study"] == "Yes":
                    df.at[i, "Construct"] = find_constructs(abstract)
                    df.at[i, "Sample Details"] = extract_sample_details(abstract)
                    df.at[i, "Research Question"] = guess_research_question(abstract)
                    df.at[i, "Key Findings"] = guess_key_findings(abstract)
                    var, ants, outs = identify_variables(abstract)
                    df.at[i, "Variables"] = var
                    df.at[i, "Antecedents"] = ants
                    df.at[i, "Outcomes"] = outs
                    df.at[i, "Unit of Analysis"] = identify_unit_of_analysis(abstract)
                else:
                    # Non-empirical papers: all downstream fields are N/A.
                    for col in _RESULT_COLUMNS[1:]:
                        df.at[i, col] = "N/A"

        st.success("Done!")
        st.dataframe(df.head(50))
        csv_data = df.to_csv(index=False).encode("utf-8")
        st.download_button("Download Analyzed CSV", data=csv_data, file_name="analysis_output.csv", mime="text/csv")