import os

import streamlit as st
from transformers import AutoTokenizer, AutoModelForTokenClassification
from sentence_transformers import SentenceTransformer
from youtube_transcript_api import YouTubeTranscriptApi
from googleapiclient.discovery import build
from torch.nn.functional import cosine_similarity
import torch

# Hugging Face model repo (token-classification / NER model)
MODEL_REPO = "zhixiusue/EduTubeNavigator"

# YouTube Data API key, read from the environment instead of hardcoding a secret
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY", "")
youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

# Mapping from predicted label IDs to BIO entity tags
token_label_map = {
    0: 'O', 1: 'B-TOPIC', 2: 'I-TOPIC', 3: 'B-STYLE', 4: 'I-STYLE',
    5: 'B-LENGTH', 6: 'I-LENGTH', 7: 'B-LANGUAGE', 8: 'I-LANGUAGE'
}
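
# Example of the BIO scheme at character level (assumed, for illustration only):
# for "딥러닝을 실습 위주로 30분 안에 배우고 싶어요" a well-trained model would tag
# 딥=B-TOPIC 러=I-TOPIC 닝=I-TOPIC, 실=B-STYLE 습=I-STYLE, 3=B-LENGTH 0=I-LENGTH 분=I-LENGTH,
# and every other character O.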

@st.cache_resource
def load_model():
    # Cached so the tokenizer and model are downloaded and loaded only once per session
    tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_REPO)
    return tokenizer, model

def predict_entities(text, tokenizer, model):
    # Character-level tokenization: the NER model tags individual characters
    tokens = list(text)
    inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True, max_length=128)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)[0]
    word_ids = inputs.word_ids(batch_index=0)
    # Merge BIO tags back into entity strings, e.g. B-TOPIC + I-TOPIC + I-TOPIC -> "딥러닝"
    entities = {}
    current_entity = ""
    current_type = ""
    prev_word_id = None
    for idx, word_id in enumerate(word_ids):
        # Skip special tokens, and extra subword pieces of a character already consumed
        if word_id is None or word_id == prev_word_id:
            continue
        prev_word_id = word_id
        label = token_label_map[predictions[idx].item()]
        if label.startswith("B-"):
            if current_type:
                entities[current_type] = current_entity
            current_type = label[2:]
            current_entity = tokens[word_id]
        elif label.startswith("I-") and label[2:] == current_type:
            current_entity += tokens[word_id]
        else:
            # O tag, or an I- tag that does not continue the open entity: close it
            if current_type:
                entities[current_type] = current_entity
                current_type = ""
                current_entity = ""
    if current_type:
        entities[current_type] = current_entity
    return entities
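
# Illustrative (hypothetical, not verified model output) result for the default query:
#   predict_entities("딥러닝을 실습 위주로 30분 안에 배우고 싶어요", tokenizer, model)
#   -> {"TOPIC": "딥러닝", "STYLE": "실습", "LENGTH": "30분"}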

def search_youtube_videos(query, max_results=10):
    # Query the YouTube Data API for candidate videos matching the search string
    response = youtube.search().list(q=query, part="snippet", type="video", maxResults=max_results).execute()
    results = []
    for item in response['items']:
        results.append({
            'video_id': item['id']['videoId'],
            'title': item['snippet']['title'],
            'description': item['snippet']['description']
        })
    return results

def get_transcript(video_id):
    # Prefer Korean captions, fall back to English; videos without captions yield ""
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['ko', 'en'])
        return " ".join([t['text'] for t in transcript])
    except Exception:
        return ""

@st.cache_resource
def load_embedder():
    # Korean sentence-embedding model (KR-SBERT), cached across Streamlit reruns
    return SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS')

def embed_texts(embedder, texts):
    return embedder.encode(texts, convert_to_tensor=True)
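
# Note: encode(..., convert_to_tensor=True) returns an (n, d) torch tensor for a
# list of n texts; callers index [0] to get a single d-dimensional embedding vector.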

def recommend_video(embedder, user_conditions, video_infos):
    # Concatenate the extracted condition values into one query sentence
    user_text = " ".join([v for v in user_conditions.values() if v])
    user_embedding = embed_texts(embedder, [user_text])[0]

    # Score each candidate by cosine similarity between the query embedding and an
    # embedding of its title, description, and transcript (fetched per video, so
    # this loop dominates the latency)
    scored = []
    for video in video_infos:
        video_text = video['title'] + " " + video['description'] + " " + get_transcript(video['video_id'])
        video_embedding = embed_texts(embedder, [video_text])[0]
        score = cosine_similarity(user_embedding, video_embedding, dim=0).item()
        scored.append((score, video))
    # Return the top 3 highest-scoring videos
    return sorted(scored, reverse=True, key=lambda x: x[0])[:3]
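
# Minimal end-to-end sketch outside the Streamlit UI (illustrative only):
#   tokenizer, model = load_model()
#   embedder = load_embedder()
#   conditions = predict_entities("딥러닝을 실습 위주로 30분 안에 배우고 싶어요", tokenizer, model)
#   candidates = search_youtube_videos(" ".join(v for v in conditions.values() if v))
#   top3 = recommend_video(embedder, conditions, candidates)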

# Streamlit UI
st.title("EduTube Navigator")
st.write("Enter a learning goal; the app extracts your conditions and recommends matching YouTube videos.")

# Korean example query (the NER and embedding models are Korean-language)
user_input = st.text_input("Enter your learning goal", "딥러닝을 실습 위주로 30분 안에 배우고 싶어요")
if st.button("Recommend"):
    tokenizer, model = load_model()
    embedder = load_embedder()

    entities = predict_entities(user_input, tokenizer, model)
    # st.subheader("📌 Extracted conditions")
    # st.json(entities)

    # Build a search query from the extracted condition values
    search_query = " ".join([v for v in entities.values() if v])
    video_candidates = search_youtube_videos(search_query)
    top_recommendations = recommend_video(embedder, entities, video_candidates)

    st.subheader("Recommended YouTube videos")
    for score, video in top_recommendations:
        st.markdown(f"**{video['title']}**  ")
        st.markdown(f"🔗 [Link](https://www.youtube.com/watch?v={video['video_id']})")