Spaces:

Chemically-motivated
/

OSINT_Tool

Running

App Files Files Community

OSINT_Tool / app.py

Canstralian

Update app.py

e511bc5 verified 4 months ago

raw

history blame

5.04 kB

	import html
	import re

	import pandas as pd
	import requests
	import streamlit as st
	import torch
	from datasets import Dataset
	from transformers import AutoModelForSequenceClassification, AutoTokenizer

	# Page header: the app title followed by a short description of the tool.
	INTRO_TEXT = """
	This tool performs Open Source Intelligence (OSINT) analysis on GitHub repositories and fetches titles from URLs.
	It also allows uploading datasets (CSV format) for fine-tuning models like DistilBERT.
	"""
	st.title("OSINT Tool 🏢")
	st.markdown(INTRO_TEXT)

	# Sidebar navigation: a radio selector whose chosen label is kept in
	# ``app_mode`` for the rest of the script to branch on.
	MODE_CHOICES = [
	    "GitHub Repository Analysis",
	    "URL Title Fetcher",
	    "Dataset Upload & Fine-Tuning",
	]
	st.sidebar.title("Navigation")
	app_mode = st.sidebar.radio("Choose the mode", MODE_CHOICES)

	# Upper bound (seconds) for every outbound HTTP request, so that a stalled
	# remote server cannot hang the app indefinitely.
	REQUEST_TIMEOUT_SECONDS = 10


	def _extract_title(page_html):
	    """Return the text of the first ``<title>`` element in *page_html*.

	    The match is case-insensitive, tolerates attributes on the opening tag
	    and titles that span several lines. Runs of whitespace are collapsed to
	    single spaces and HTML entities (e.g. ``&amp;``) are decoded.

	    Returns ``None`` when no ``<title>`` element is found; an empty element
	    yields an empty string.
	    """
	    match = re.search(
	        r"<title\b[^>]*>(.*?)</title\s*>",
	        page_html,
	        re.IGNORECASE | re.DOTALL,
	    )
	    if match is None:
	        return None
	    return html.unescape(" ".join(match.group(1).split()))


	def _render_github_analysis():
	    """Render the GitHub mode: ask for owner/name and show repository details.

	    On button press the public GitHub REST endpoint ``/repos/{owner}/{name}``
	    is queried; a 200 response is rendered field by field, any other status
	    shows the ``message`` returned by the API.
	    """
	    st.header("GitHub Repository Analysis")
	    # Strip stray whitespace (e.g. from copy/paste) that would otherwise end
	    # up inside the request URL.
	    repo_owner = st.text_input("Enter GitHub Repository Owner", "huggingface").strip()
	    repo_name = st.text_input("Enter GitHub Repository Name", "transformers").strip()

	    if not st.button("Analyze Repository"):
	        return
	    if not (repo_owner and repo_name):
	        st.warning("Please enter both repository owner and name.")
	        return

	    # Broad catch is deliberate: this is the UI boundary, and any failure
	    # (network, invalid JSON, unexpected payload) should be shown to the
	    # user instead of aborting the script run.
	    try:
	        response = requests.get(
	            f"https://api.github.com/repos/{repo_owner}/{repo_name}",
	            timeout=REQUEST_TIMEOUT_SECONDS,
	        )
	        data = response.json()

	        if response.status_code == 200:
	            st.subheader("Repository Details")
	            st.write(f"Name: {data['name']}")
	            st.write(f"Owner: {data['owner']['login']}")
	            st.write(f"Stars: {data['stargazers_count']}")
	            st.write(f"Forks: {data['forks_count']}")
	            st.write(f"Language: {data['language']}")
	            st.write(f"Description: {data['description']}")
	        else:
	            st.error(f"Error: {data.get('message', 'Something went wrong with the request')}")
	    except Exception as e:
	        st.error(f"Error occurred: {e}")


	def _render_title_fetcher():
	    """Render the URL mode: fetch a page and display its ``<title>`` text."""
	    st.header("URL Title Fetcher")
	    url = st.text_input("Enter URL", "https://www.huggingface.co").strip()

	    if not st.button("Fetch Title"):
	        return
	    if not url:
	        st.warning("Please enter a valid URL.")
	        return

	    # NOTE(review): the URL is user-supplied and fetched from the server
	    # side; consider restricting schemes/hosts if this runs in a trusted
	    # network.
	    try:
	        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
	        if response.status_code != 200:
	            st.error(f"Failed to retrieve the page. Status code: {response.status_code}")
	            return

	        title = _extract_title(response.text)
	        if title is None:
	            st.warning("Title tag not found in the page")
	        else:
	            st.write(f"Page Title: {title}")
	    except Exception as e:
	        st.error(f"Error occurred: {e}")


	def _render_fine_tuning():
	    """Render the dataset mode: preview an uploaded CSV and run the demo step.

	    The "fine-tuning" step only loads the selected model and tokenizer and
	    tokenizes the ``text`` column; no training is performed.
	    """
	    st.header("Dataset Upload & Fine-Tuning")

	    uploaded_file = st.file_uploader("Upload a CSV file for fine-tuning", type="csv")
	    if uploaded_file is None:
	        st.warning("Please upload a dataset.")
	        return

	    # Load the CSV into a pandas DataFrame; report unreadable files to the
	    # user instead of letting the parser exception escape.
	    try:
	        df = pd.read_csv(uploaded_file)
	    except (pd.errors.ParserError, pd.errors.EmptyDataError, UnicodeDecodeError) as e:
	        st.error(f"Could not read the CSV file: {e}")
	        return

	    # Display dataset preview
	    st.subheader("Dataset Preview")
	    st.write(df.head())

	    model_name = st.selectbox("Select model for fine-tuning", ["distilbert-base-uncased"])

	    if not st.button("Fine-tune Model"):
	        return
	    if not model_name:
	        st.warning("Please select a model for fine-tuning.")
	        return
	    # The tokenization step below reads the 'text' column; fail early with
	    # a clear message rather than a bare KeyError.
	    if "text" not in df.columns:
	        st.error("The uploaded CSV must contain a 'text' column.")
	        return

	    try:
	        # Convert to Hugging Face dataset format only once the user has
	        # actually asked for fine-tuning, not on every rerun.
	        dataset = Dataset.from_pandas(df)

	        model = AutoModelForSequenceClassification.from_pretrained(model_name)
	        tokenizer = AutoTokenizer.from_pretrained(model_name)

	        def preprocess_function(examples):
	            """Tokenize the 'text' column of a batch, truncating and padding."""
	            return tokenizer(examples['text'], truncation=True, padding=True)

	        tokenized_datasets = dataset.map(preprocess_function, batched=True)

	        # Intended training configuration. NOTE: nothing consumes this yet;
	        # real fine-tuning would need a training loop such as the Hugging
	        # Face Trainer, fed with `model` and `tokenized_datasets`.
	        train_args = {
	            "output_dir": "./results",
	            "num_train_epochs": 3,
	            "per_device_train_batch_size": 16,
	            "logging_dir": "./logs",
	        }

	        st.success("Fine-tuning started (demo)!")
	    except Exception as e:
	        st.error(f"Error during fine-tuning: {e}")


	# Dispatch on the mode chosen in the sidebar.
	# GitHub Repository Analysis
	if app_mode == "GitHub Repository Analysis":
	    _render_github_analysis()
	# URL Title Fetcher
	elif app_mode == "URL Title Fetcher":
	    _render_title_fetcher()
	# Dataset Upload & Fine-Tuning
	elif app_mode == "Dataset Upload & Fine-Tuning":
	    _render_fine_tuning()