import streamlit as st
import requests
import re
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
from datasets import Dataset

# Title and description
st.title("OSINT Tool 🏢")
st.markdown("""
    This tool performs **Open Source Intelligence (OSINT)** analysis on GitHub repositories and fetches titles from URLs.
    It also allows uploading datasets (CSV format) for fine-tuning models like **DistilBERT**.
    """)
    
# Sidebar for navigation
st.sidebar.title("Navigation")
app_mode = st.sidebar.radio("Choose the mode", ["GitHub Repository Analysis", "URL Title Fetcher", "Dataset Upload & Fine-Tuning"])

# GitHub Repository Analysis
if app_mode == "GitHub Repository Analysis":
    st.header("GitHub Repository Analysis")
    repo_owner = st.text_input("Enter GitHub Repository Owner", "huggingface")
    repo_name = st.text_input("Enter GitHub Repository Name", "transformers")
    
    if st.button("Analyze Repository"):
        if repo_owner and repo_name:
            try:
                response = requests.get(f"https://api.github.com/repos/{repo_owner}/{repo_name}", timeout=10)
                data = response.json()
                
                if response.status_code == 200:
                    st.subheader("Repository Details")
                    st.write(f"**Name**: {data['name']}")
                    st.write(f"**Owner**: {data['owner']['login']}")
                    st.write(f"**Stars**: {data['stargazers_count']}")
                    st.write(f"**Forks**: {data['forks_count']}")
                    st.write(f"**Language**: {data['language']}")
                    st.write(f"**Description**: {data['description']}")
                else:
                    st.error(f"Error: {data.get('message', 'Something went wrong with the request')}")
            except Exception as e:
                st.error(f"Error occurred: {e}")
        else:
            st.warning("Please enter both repository owner and name.")
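# Note: unauthenticated GitHub API calls are rate-limited (currently 60
# requests per hour per IP). A minimal sketch of an authenticated request,
# assuming a personal access token stored in Streamlit secrets under a
# hypothetical "GITHUB_TOKEN" key, applied to the same repos endpoint used
# above:
#
#     headers = {"Authorization": f"Bearer {st.secrets['GITHUB_TOKEN']}"}
#     response = requests.get(
#         f"https://api.github.com/repos/{repo_owner}/{repo_name}",
#         headers=headers,
#         timeout=10,
#     )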

# URL Title Fetcher
elif app_mode == "URL Title Fetcher":
    st.header("URL Title Fetcher")
    url = st.text_input("Enter URL", "https://www.huggingface.co")
    
    if st.button("Fetch Title"):
        if url:
            try:
                response = requests.get(url, timeout=10)
                if response.status_code == 200:
                    # Try to extract the title from the HTML (case-insensitive,
                    # tolerant of attributes on the <title> tag)
                    match = re.search(r'<title[^>]*>(.*?)</title>', response.text, re.IGNORECASE | re.DOTALL)
                    if match:
                        title = match.group(1)
                        st.write(f"**Page Title**: {title}")
                    else:
                        st.warning("Title tag not found in the page")
                else:
                    st.error(f"Failed to retrieve the page. Status code: {response.status_code}")
            except Exception as e:
                st.error(f"Error occurred: {e}")
        else:
            st.warning("Please enter a valid URL.")
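# The regex above works for simple pages; a more robust sketch would parse
# the HTML instead. Assumes beautifulsoup4 is installed (it is not a
# dependency of this app as written):
#
#     from bs4 import BeautifulSoup
#     soup = BeautifulSoup(response.text, "html.parser")
#     title = soup.title.string if soup.title else None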

# Dataset Upload & Fine-Tuning
elif app_mode == "Dataset Upload & Fine-Tuning":
    st.header("Dataset Upload & Fine-Tuning")
    
    uploaded_file = st.file_uploader("Upload a CSV file for fine-tuning", type="csv")
    
    if uploaded_file is not None:
        # Load the CSV into a pandas DataFrame
        df = pd.read_csv(uploaded_file)
        
        # Display dataset preview
        st.subheader("Dataset Preview")
        st.write(df.head())

        # Convert CSV to Hugging Face dataset format
        dataset = Dataset.from_pandas(df)
        
        model_name = st.selectbox("Select model for fine-tuning", ["distilbert-base-uncased"])
        
        if st.button("Fine-tune Model"):
            if model_name:
                try:
                    model = AutoModelForSequenceClassification.from_pretrained(model_name)
                    tokenizer = AutoTokenizer.from_pretrained(model_name)
                    
                    # Tokenize the dataset; assumes the uploaded CSV has a 'text' column
                    def preprocess_function(examples):
                        return tokenizer(examples['text'], truncation=True, padding=True)
                    
                    tokenized_datasets = dataset.map(preprocess_function, batched=True)
                    
                    # Example training hyperparameters (would be passed to TrainingArguments in a real run)
                    train_args = {
                        "output_dir": "./results",
                        "num_train_epochs": 3,
                        "per_device_train_batch_size": 16,
                        "logging_dir": "./logs",
                    }
                    
                    # Demo only: actual fine-tuning would use the Hugging Face
                    # Trainer, as sketched below.
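                    # A minimal sketch of that Trainer call, assuming the CSV
                    # also has a numeric 'label' column (Trainer's default
                    # collator maps 'label' to the model's 'labels' input).
                    # Left commented out so the demo stays fast:
                    #
                    # from transformers import Trainer, TrainingArguments
                    # trainer = Trainer(
                    #     model=model,
                    #     args=TrainingArguments(**train_args),
                    #     train_dataset=tokenized_datasets,
                    # )
                    # trainer.train()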
                    
                    st.success("Fine-tuning started (demo)!")
                except Exception as e:
                    st.error(f"Error during fine-tuning: {e}")
            else:
                st.warning("Please select a model for fine-tuning.")
    
    else:
        st.info("Upload a CSV dataset to begin.")