Spaces:

shashwatashish
/

ArticleInsightDemo

Sleeping

App Files Files Community

ArticleInsightDemo / app.py

shashwatashish

Update app.py

6555a28 verified 26 days ago

raw

history blame contribute delete

5.68 kB

	import streamlit as st
	import pandas as pd
	import torch
	from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

	# Hugging Face Hub id of the NLI checkpoint used for zero-shot classification.
	MODEL_NAME = "valhalla/distilbart-mnli-12-1"

	# Device index later passed to the transformers pipeline: start from -1 and
	# switch to 0 only when torch reports that CUDA is available.
	device = -1
	if torch.cuda.is_available():
	    device = 0

	@st.cache_resource
	def load_zero_shot_pipeline():
	    """Build the zero-shot classification pipeline for ``MODEL_NAME``.

	    The tokenizer and the sequence-classification model are both loaded from
	    ``MODEL_NAME`` and handed to ``transformers.pipeline`` together with the
	    module-level ``device`` index. The function is wrapped in
	    ``st.cache_resource`` so Streamlit can reuse the returned object.

	    Returns:
	        The pipeline object created for the "zero-shot-classification" task.
	    """
	    # Keyword arguments are evaluated left to right, so the tokenizer is
	    # still loaded before the model.
	    return pipeline(
	        "zero-shot-classification",
	        tokenizer=AutoTokenizer.from_pretrained(MODEL_NAME),
	        model=AutoModelForSequenceClassification.from_pretrained(MODEL_NAME),
	        device=device,
	    )


	# Module-level classifier instance used by the analysis helpers.
	zero_shot = load_zero_shot_pipeline()

	# Page header and usage instructions shown above the uploader.
	st.title("ArticleInsight (Demo Offline Pipeline)")

	# The string must be delimited by plain triple quotes; backslash-escaped
	# quote characters outside a string literal are a syntax error.
	st.markdown("""
	Upload a CSV with an 'Abstract' column. We'll run a simple 8-step analysis:
	1. Empirical Study?
	2. Identify Construct
	3. Sample Details
	4. Main Research Question
	5. Key Findings
	6. Variables (IV/DV)
	7. Antecedents, Outcomes
	8. Unit of Analysis

	Disclaimer: This is a very naive demonstration using zero-shot classification and simple regex.
	It won't be super accurate, but requires no coding from you!
	""")

	import re
	import string
	from collections import Counter

	# === Functions ===
	# The helpers are defined ahead of the UI code because Streamlit executes the
	# script top to bottom on every rerun; calling a helper before its `def` has
	# run raises NameError as soon as "Run Analysis" is pressed.

	# Columns that are only filled in for abstracts classified as empirical.
	_DETAIL_COLUMNS = (
	    "Construct",
	    "Sample Details",
	    "Research Question",
	    "Key Findings",
	    "Variables",
	    "Antecedents",
	    "Outcomes",
	    "Unit of Analysis",
	)
	_RESULT_COLUMNS = ("Empirical Study", *_DETAIL_COLUMNS)

	# An integer, optionally written with thousands separators ("1,200").
	_COUNT = r"\d+(?:,\d{3})*"

	# Sample-size phrases. The alternation bars must stay unescaped: "\|" matches
	# a literal pipe character and would disable every alternative.
	_SAMPLE_PATTERN = re.compile(
	    rf"\bn\s*=\s*{_COUNT}|sample of {_COUNT}"
	    rf"|{_COUNT}\s+participants|{_COUNT}\s+subjects"
	)

	# "<impact|influence|effect> of <IV> on <DV>", one word for each variable.
	_VARIABLES_PATTERN = re.compile(
	    r"\b(?:impact|influence|effect) of (\w+) on (\w+)"
	)

	# Checked in order; the first hit wins. Keywords are anchored at a word start
	# so "confirm" no longer counts as "firm" and "steam" no longer counts as
	# "team", while plurals and derived forms ("teams", "organizational") still do.
	_UNIT_PATTERNS = (
	    ("Team", re.compile(r"\b(?:team|groups)")),
	    ("Organization", re.compile(r"\b(?:organization|firm)")),
	    ("Individual", re.compile(r"\b(?:participant|individual|student|employee)")),
	)


	def classify_empirical(text):
	    """Label an abstract as an empirical study via zero-shot classification.

	    Args:
	        text: The abstract to classify.

	    Returns:
	        "Yes" when the first label returned by the classifier is
	        "empirical study" with a score above 0.5, "No" when it is
	        "theoretical paper" with a score above 0.5, otherwise "Unknown".
	        Blank input returns "Unknown" without calling the classifier.
	    """
	    if not text or not text.strip():
	        return "Unknown"
	    # `zero_shot` is the module-level pipeline created earlier in the file.
	    res = zero_shot(text, ["empirical study", "theoretical paper"])
	    top_label = res["labels"][0]
	    top_score = res["scores"][0]
	    if top_score > 0.5:
	        if top_label == "empirical study":
	            return "Yes"
	        if top_label == "theoretical paper":
	            return "No"
	    return "Unknown"


	def find_constructs(text):
	    """Return the two most frequent long words of the abstract.

	    Words are lowercased, stripped of surrounding punctuation, and counted
	    only when they are purely alphabetic and longer than five characters.
	    Ties keep the order of first appearance.

	    Returns:
	        The top words joined by ", ", or "Unknown" if no word qualifies.
	    """
	    words = (token.strip(string.punctuation) for token in text.lower().split())
	    counts = Counter(w for w in words if len(w) > 5 and w.isalpha())
	    if not counts:
	        return "Unknown"
	    return ", ".join(word for word, _ in counts.most_common(2))


	def extract_sample_details(text):
	    """Collect sample-size phrases and participant types from the abstract.

	    Returns:
	        Every sample-size phrase found ("n = 120", "sample of 50",
	        "300 participants", "40 subjects"), followed by "students" and/or
	        "employees" when those words occur, joined by "; ". Returns
	        "Unknown" when nothing is found.
	    """
	    lowered = text.lower()
	    details = _SAMPLE_PATTERN.findall(lowered)
	    if "student" in lowered:
	        details.append("students")
	    if "employee" in lowered:
	        details.append("employees")
	    # Joining a list avoids the stray leading "; " that string concatenation
	    # produced when there was no numeric match.
	    return "; ".join(details) if details else "Unknown"


	def _snippet_from(text, phrase, length):
	    """Return up to `length` characters of `text` starting at `phrase`.

	    The search is case-insensitive and runs on the original text, so the
	    offset is valid even where lowercasing would change the string length.
	    Returns None when the phrase does not occur.
	    """
	    match = re.search(re.escape(phrase), text, re.IGNORECASE)
	    if match is None:
	        return None
	    return text[match.start():match.start() + length]


	def guess_research_question(text):
	    """Guess the research question from a cue phrase in the abstract.

	    Returns:
	        "Does <60-character snippet>?" when "effect of" occurs; otherwise the
	        60-character snippet starting at "aim of this study"; otherwise
	        "Unknown".
	    """
	    snippet = _snippet_from(text, "effect of", 60)
	    if snippet is not None:
	        return f"Does {snippet}?"
	    snippet = _snippet_from(text, "aim of this study", 60)
	    return snippet if snippet is not None else "Unknown"


	def guess_key_findings(text):
	    """Return up to 100 characters starting at a findings cue phrase.

	    "we find that" takes precedence over "results show"; returns "Unknown"
	    when neither phrase occurs.
	    """
	    for cue in ("we find that", "results show"):
	        snippet = _snippet_from(text, cue, 100)
	        if snippet is not None:
	            return snippet
	    return "Unknown"


	def identify_variables(text):
	    """Extract an independent and a dependent variable from the abstract.

	    Looks for "<impact|influence|effect> of X on Y" in the lowercased text.

	    Returns:
	        A tuple (summary, iv, dv) where summary is "IV: X, DV: Y", or three
	        "Unknown" strings when the phrase is not present.
	    """
	    match = _VARIABLES_PATTERN.search(text.lower())
	    if match is None:
	        return "Unknown", "Unknown", "Unknown"
	    iv, dv = match.groups()
	    return f"IV: {iv}, DV: {dv}", iv, dv


	def identify_unit_of_analysis(text):
	    """Return "Team", "Organization", "Individual" or "Unknown".

	    The categories are tried in that order and the first one whose keywords
	    appear in the lowercased abstract is returned.
	    """
	    lowered = text.lower()
	    for unit, pattern in _UNIT_PATTERNS:
	        if pattern.search(lowered):
	            return unit
	    return "Unknown"


	def _analyze_abstract(abstract):
	    """Run every analysis step on one abstract and return {column: value}.

	    The detail columns are set to "N/A" unless the abstract is classified
	    as an empirical study.
	    """
	    result = {"Empirical Study": classify_empirical(abstract)}
	    if result["Empirical Study"] != "Yes":
	        result.update(dict.fromkeys(_DETAIL_COLUMNS, "N/A"))
	        return result

	    result["Construct"] = find_constructs(abstract)
	    result["Sample Details"] = extract_sample_details(abstract)
	    result["Research Question"] = guess_research_question(abstract)
	    result["Key Findings"] = guess_key_findings(abstract)
	    variables, antecedents, outcomes = identify_variables(abstract)
	    result["Variables"] = variables
	    result["Antecedents"] = antecedents
	    result["Outcomes"] = outcomes
	    result["Unit of Analysis"] = identify_unit_of_analysis(abstract)
	    return result


	def main():
	    """Render the upload widget, run the analysis and offer the CSV download."""
	    uploaded_file = st.file_uploader("Upload CSV with 'Abstract' column")
	    if not uploaded_file:
	        return

	    try:
	        df = pd.read_csv(uploaded_file)
	    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
	        # Show a readable message instead of a traceback for a bad upload.
	        st.error(f"Could not read the uploaded file as CSV: {exc}")
	        st.stop()

	    if "Abstract" not in df.columns:
	        st.error("CSV must have an 'Abstract' column.")
	        st.stop()

	    st.success("File uploaded successfully!")
	    if not st.button("Run Analysis"):
	        return

	    with st.spinner("Analyzing each abstract..."):
	        # Missing abstracts become "" (classified as "Unknown") rather than
	        # the literal string "nan".
	        abstracts = df["Abstract"].fillna("").astype(str)
	        results = [_analyze_abstract(abstract) for abstract in abstracts]
	        # Assign by position so the outcome does not depend on the index.
	        for column in _RESULT_COLUMNS:
	            df[column] = [row[column] for row in results]

	    st.success("Done!")
	    st.dataframe(df.head(50))
	    csv_data = df.to_csv(index=False).encode("utf-8")
	    st.download_button("Download Analyzed CSV", data=csv_data, file_name="analysis_output.csv", mime="text/csv")


	# Streamlit executes the script as the "__main__" module, so the app still
	# starts under `streamlit run`, while importing this file stays side-effect
	# free.
	if __name__ == "__main__":
	    main()

	import os
	# Export Streamlit server settings through the process environment.
	# NOTE(review): these assignments execute only once the script has reached
	# its last lines; presumably they are meant to configure the server, so
	# confirm they take effect when set this late.
	os.environ.update(
	    {
	        "STREAMLIT_SERVER_HEADLESS": "true",
	        "STREAMLIT_SERVER_ADDRESS": "0.0.0.0",
	    }
	)