import streamlit as st
import requests
import re
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
from datasets import Dataset

# Title and description
st.title("OSINT Tool 🏢")
st.markdown("""
    This tool performs **Open Source Intelligence (OSINT)** analysis on GitHub repositories and fetches titles from URLs.
    It also allows uploading datasets (CSV format) for fine-tuning models like **DistilBERT**.
    """)
    
# Sidebar for navigation
st.sidebar.title("Navigation")
app_mode = st.sidebar.radio("Choose the mode", ["GitHub Repository Analysis", "URL Title Fetcher", "Dataset Upload & Fine-Tuning"])

# GitHub Repository Analysis
if app_mode == "GitHub Repository Analysis":
    st.header("GitHub Repository Analysis")
    repo_owner = st.text_input("Enter GitHub Repository Owner", "huggingface")
    repo_name = st.text_input("Enter GitHub Repository Name", "transformers")
    
    if st.button("Analyze Repository"):
        if repo_owner and repo_name:
            try:
                response = requests.get(f"https://api.github.com/repos/{repo_owner}/{repo_name}", timeout=10)
                data = response.json()
                
                if response.status_code == 200:
                    st.subheader("Repository Details")
                    st.write(f"**Name**: {data['name']}")
                    st.write(f"**Owner**: {data['owner']['login']}")
                    st.write(f"**Stars**: {data['stargazers_count']}")
                    st.write(f"**Forks**: {data['forks_count']}")
                    st.write(f"**Language**: {data['language']}")
                    st.write(f"**Description**: {data['description']}")
                else:
                    st.error(f"Error: {data.get('message', 'Something went wrong with the request')}")
            except Exception as e:
                st.error(f"Error occurred: {e}")
        else:
            st.warning("Please enter both repository owner and name.")
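# Note: unauthenticated GitHub API calls are rate-limited (currently 60
# requests per hour per IP). A minimal sketch of an authenticated request,
# assuming a personal access token stored in Streamlit secrets under a
# hypothetical "GITHUB_TOKEN" key, applied to the same repos endpoint used
# above:
#
#     headers = {"Authorization": f"Bearer {st.secrets['GITHUB_TOKEN']}"}
#     response = requests.get(
#         f"https://api.github.com/repos/{repo_owner}/{repo_name}",
#         headers=headers,
#         timeout=10,
#     )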

# URL Title Fetcher
elif app_mode == "URL Title Fetcher":
    st.header("URL Title Fetcher")
    url = st.text_input("Enter URL", "https://www.huggingface.co")
    
    if st.button("Fetch Title"):
        if url:
            try:
                response = requests.get(url, timeout=10)
                if response.status_code == 200:
                    # Try to extract the title from the HTML (case-insensitive,
                    # tolerant of attributes on the <title> tag)
                    match = re.search(r'<title[^>]*>(.*?)</title>', response.text, re.IGNORECASE | re.DOTALL)
                    if match:
                        title = match.group(1)
                        st.write(f"**Page Title**: {title}")
                    else:
                        st.warning("Title tag not found in the page")
                else:
                    st.error(f"Failed to retrieve the page. Status code: {response.status_code}")
            except Exception as e:
                st.error(f"Error occurred: {e}")
        else:
            st.warning("Please enter a valid URL.")
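# The regex above works for simple pages; a more robust sketch would parse
# the HTML instead. Assumes beautifulsoup4 is installed (it is not a
# dependency of this app as written):
#
#     from bs4 import BeautifulSoup
#     soup = BeautifulSoup(response.text, "html.parser")
#     title = soup.title.string if soup.title else None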

# Dataset Upload & Fine-Tuning
elif app_mode == "Dataset Upload & Fine-Tuning":
    st.header("Dataset Upload & Fine-Tuning")
    
    uploaded_file = st.file_uploader("Upload a CSV file for fine-tuning", type="csv")
    
    if uploaded_file is not None:
        # Load the CSV into a pandas DataFrame
        df = pd.read_csv(uploaded_file)
        
        # Display dataset preview
        st.subheader("Dataset Preview")
        st.write(df.head())

        # Convert CSV to Hugging Face dataset format
        dataset = Dataset.from_pandas(df)
        
        model_name = st.selectbox("Select model for fine-tuning", ["distilbert-base-uncased"])
        
        if st.button("Fine-tune Model"):
            if model_name:
                try:
                    model = AutoModelForSequenceClassification.from_pretrained(model_name)
                    tokenizer = AutoTokenizer.from_pretrained(model_name)
                    
                    # Tokenize the dataset; assumes the uploaded CSV has a 'text' column
                    def preprocess_function(examples):
                        return tokenizer(examples['text'], truncation=True, padding=True)
                    
                    tokenized_datasets = dataset.map(preprocess_function, batched=True)
                    
                    # Example training hyperparameters (would be passed to TrainingArguments in a real run)
                    train_args = {
                        "output_dir": "./results",
                        "num_train_epochs": 3,
                        "per_device_train_batch_size": 16,
                        "logging_dir": "./logs",
                    }
                    
                    # Demo only: actual fine-tuning would use the Hugging Face
                    # Trainer, as sketched below.
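                    # A minimal sketch of that Trainer call, assuming the CSV
                    # also has a numeric 'label' column (Trainer's default
                    # collator maps 'label' to the model's 'labels' input).
                    # Left commented out so the demo stays fast:
                    #
                    # from transformers import Trainer, TrainingArguments
                    # trainer = Trainer(
                    #     model=model,
                    #     args=TrainingArguments(**train_args),
                    #     train_dataset=tokenized_datasets,
                    # )
                    # trainer.train()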
                    
                    st.success("Fine-tuning started (demo)!")
                except Exception as e:
                    st.error(f"Error during fine-tuning: {e}")
            else:
                st.warning("Please select a model for fine-tuning.")
    
    else:
        st.info("Upload a CSV dataset to begin.")