import os

import streamlit as st
from transformers import AutoTokenizer, AutoModelForTokenClassification
from sentence_transformers import SentenceTransformer
from youtube_transcript_api import YouTubeTranscriptApi
from googleapiclient.discovery import build
from torch.nn.functional import cosine_similarity
import torch

# Hugging Face model repo (token-classification / NER model)
MODEL_REPO = "zhixiusue/EduTubeNavigator"

# YouTube Data API key, read from the environment instead of hardcoding a secret
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY", "")
youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

# Mapping from predicted label IDs to BIO entity tags
token_label_map = {
    0: 'O', 1: 'B-TOPIC', 2: 'I-TOPIC', 3: 'B-STYLE', 4: 'I-STYLE',
    5: 'B-LENGTH', 6: 'I-LENGTH', 7: 'B-LANGUAGE', 8: 'I-LANGUAGE'
}
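
# Example of the BIO scheme at character level (assumed, for illustration only):
# for "딥러닝을 실습 위주로 30분 안에 배우고 싶어요" a well-trained model would tag
# 딥=B-TOPIC 러=I-TOPIC 닝=I-TOPIC, 실=B-STYLE 습=I-STYLE, 3=B-LENGTH 0=I-LENGTH 분=I-LENGTH,
# and every other character O.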

@st.cache_resource
def load_model():
    # Cached so the tokenizer and model are downloaded and loaded only once per session
    tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_REPO)
    return tokenizer, model

def predict_entities(text, tokenizer, model):
    # Character-level tokenization: the NER model tags individual characters
    tokens = list(text)
    inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True, max_length=128)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)[0]
    word_ids = inputs.word_ids(batch_index=0)
    # Merge BIO tags back into entity strings, e.g. B-TOPIC + I-TOPIC + I-TOPIC -> "딥러닝"
    entities = {}
    current_entity = ""
    current_type = ""
    prev_word_id = None
    for idx, word_id in enumerate(word_ids):
        # Skip special tokens, and extra subword pieces of a character already consumed
        if word_id is None or word_id == prev_word_id:
            continue
        prev_word_id = word_id
        label = token_label_map[predictions[idx].item()]
        if label.startswith("B-"):
            if current_type:
                entities[current_type] = current_entity
            current_type = label[2:]
            current_entity = tokens[word_id]
        elif label.startswith("I-") and label[2:] == current_type:
            current_entity += tokens[word_id]
        else:
            # O tag, or an I- tag that does not continue the open entity: close it
            if current_type:
                entities[current_type] = current_entity
                current_type = ""
                current_entity = ""
    if current_type:
        entities[current_type] = current_entity
    return entities
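
# Illustrative (hypothetical, not verified model output) result for the default query:
#   predict_entities("딥러닝을 실습 위주로 30분 안에 배우고 싶어요", tokenizer, model)
#   -> {"TOPIC": "딥러닝", "STYLE": "실습", "LENGTH": "30분"}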

def search_youtube_videos(query, max_results=10):
    # Query the YouTube Data API for candidate videos matching the search string
    response = youtube.search().list(q=query, part="snippet", type="video", maxResults=max_results).execute()
    results = []
    for item in response['items']:
        results.append({
            'video_id': item['id']['videoId'],
            'title': item['snippet']['title'],
            'description': item['snippet']['description']
        })
    return results

def get_transcript(video_id):
    # Prefer Korean captions, fall back to English; videos without captions yield ""
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['ko', 'en'])
        return " ".join([t['text'] for t in transcript])
    except Exception:
        return ""

@st.cache_resource
def load_embedder():
    # Korean sentence-embedding model (KR-SBERT), cached across Streamlit reruns
    return SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS')

def embed_texts(embedder, texts):
    return embedder.encode(texts, convert_to_tensor=True)
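
# Note: encode(..., convert_to_tensor=True) returns an (n, d) torch tensor for a
# list of n texts; callers index [0] to get a single d-dimensional embedding vector.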

def recommend_video(embedder, user_conditions, video_infos):
    # Concatenate the extracted condition values into one query sentence
    user_text = " ".join([v for v in user_conditions.values() if v])
    user_embedding = embed_texts(embedder, [user_text])[0]

    # Score each candidate by cosine similarity between the query embedding and an
    # embedding of its title, description, and transcript (fetched per video, so
    # this loop dominates the latency)
    scored = []
    for video in video_infos:
        video_text = video['title'] + " " + video['description'] + " " + get_transcript(video['video_id'])
        video_embedding = embed_texts(embedder, [video_text])[0]
        score = cosine_similarity(user_embedding, video_embedding, dim=0).item()
        scored.append((score, video))
    # Return the top 3 highest-scoring videos
    return sorted(scored, reverse=True, key=lambda x: x[0])[:3]
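
# Minimal end-to-end sketch outside the Streamlit UI (illustrative only):
#   tokenizer, model = load_model()
#   embedder = load_embedder()
#   conditions = predict_entities("딥러닝을 실습 위주로 30분 안에 배우고 싶어요", tokenizer, model)
#   candidates = search_youtube_videos(" ".join(v for v in conditions.values() if v))
#   top3 = recommend_video(embedder, conditions, candidates)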

# Streamlit UI
st.title("EduTube Navigator")
st.write("Enter a learning goal; the app extracts your conditions and recommends matching YouTube videos.")

# Korean example query (the NER and embedding models are Korean-language)
user_input = st.text_input("Enter your learning goal", "딥러닝을 실습 위주로 30분 안에 배우고 싶어요")
if st.button("Recommend"):
    tokenizer, model = load_model()
    embedder = load_embedder()

    entities = predict_entities(user_input, tokenizer, model)
    # st.subheader("📌 Extracted conditions")
    # st.json(entities)

    # Build a search query from the extracted condition values
    search_query = " ".join([v for v in entities.values() if v])
    video_candidates = search_youtube_videos(search_query)
    top_recommendations = recommend_video(embedder, entities, video_candidates)

    st.subheader("Recommended YouTube videos")
    for score, video in top_recommendations:
        st.markdown(f"**{video['title']}**  ")
        st.markdown(f"🔗 [Link](https://www.youtube.com/watch?v={video['video_id']})")