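"""Streamlit repository recommender.

Embeds a user query with the CodeT5 encoder and ranks repositories from the
frankjosh/filtered_dataset by cosine similarity against their precomputed
embeddings, then displays the top matches along with a fetched README preview.
"""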
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
import requests
from datasets import load_dataset

# Set page configuration
st.set_page_config(page_title="Repository Recommender", layout="wide")

# Select the GPU if one is available, otherwise fall back to CPU so the app still runs
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and tokenizer (cached across Streamlit reruns)
@st.cache_resource
def load_model():
    model_name = "Salesforce/codet5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(DEVICE)
    model.eval()
    return tokenizer, model

def generate_embedding(text, tokenizer, model):
    """Generate an embedding for a given text by mean-pooling the encoder's hidden states."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.encoder(**inputs)
    # Average the token embeddings into a single fixed-size vector
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

# Load dataset (expected to include a precomputed 'embedding' column per repository)
@st.cache_data
def load_data():
    dataset = load_dataset("frankjosh/filtered_dataset", split="train")
    df = pd.DataFrame(dataset).head(500)  # Limit to 500 repositories to keep the app responsive
    return df

def fetch_readme(repo_url):
    """Fetch the README file from a GitHub repository.

    Uses raw.githubusercontent.com so the response is the Markdown source rather
    than the rendered HTML page; assumes repo_url points at github.com and that
    the default branch is 'main'.
    """
    try:
        repo_path = repo_url.rstrip("/").replace("https://github.com/", "")
        readme_url = f"https://raw.githubusercontent.com/{repo_path}/main/README.md"
        response = requests.get(readme_url, timeout=10)
        if response.status_code == 200:
            return response.text
        return "README not available."
    except Exception as e:
        return f"Error fetching README: {e}"

# Main application logic
def main():
    st.title("Repository Recommender System")
    st.write("Find Python repositories to learn production-level coding practices.")

    # Load resources
    tokenizer, model = load_model()
    data = load_data()

    # Input user query
    user_query = st.text_input("Describe your project or learning goal:",
                               "I am working on a project to recommend music using pandas and numpy.")
    if user_query:
        query_embedding = generate_embedding(user_query, tokenizer, model)

        # Compute cosine similarity between the query and each repository's precomputed embedding
        repo_embeddings = np.vstack(data['embedding'].apply(np.array).values)
        data['similarity'] = cosine_similarity([query_embedding], repo_embeddings)[0]

        # Filter and sort recommendations
        top_recommendations = (
            data.sort_values(by='similarity', ascending=False)
            .head(5)
        )

        # Display recommendations
        st.subheader("Top Recommendations")
        for idx, row in top_recommendations.iterrows():
            st.markdown(f"### {row['repo']}")
            st.write(f"**Path:** {row['path']}")
            st.write(f"**Summary:** {row['summary']}")
            st.write(f"**Similarity Score:** {row['similarity']:.2f}")
            st.markdown(f"[Repository Link]({row['url']})")

            # Fetch and display README
            st.subheader("Repository README")
            readme_content = fetch_readme(row['url'])
            st.code(readme_content)

if __name__ == "__main__":
    main()
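
# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py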