Spaces:
Running
Running
import streamlit as st | |
from transcript_extractor import get_transcript | |
import logging | |
import sys | |
# Send every log record to stdout (no file handlers) in a timestamped format.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

logging.basicConfig(
    stream=sys.stdout,
    format=_LOG_FORMAT,
    level=logging.INFO,
)

# Module-scoped logger shared by the functions in this file.
logger = logging.getLogger(__name__)
def _metadata_count(value, default=0):
    """Coerce a metadata counter to ``int``, falling back to *default*.

    Counters may be present but ``None`` or otherwise non-numeric; such values
    must not abort processing of the whole video.
    """
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def process_single_video(db_handler, data_processor, video_id, embedding_model):
    """Process a single video for indexing.

    Args:
        db_handler: Object providing ``get_elasticsearch_index_by_youtube_id``,
            ``add_video``, ``add_embedding_model``, ``get_video_by_youtube_id``
            and ``add_elasticsearch_index``.
        data_processor: Object providing ``process_transcript`` and
            ``build_index``.
        video_id: YouTube video identifier.
        embedding_model: Name of the embedding model; it becomes part of the
            index name.

    Returns:
        The existing index (as returned by the database handler) when the
        video was already processed, the index name returned by
        ``data_processor.build_index`` on success, or ``None`` when any step
        fails. Exceptions are logged and never propagated to the caller.
    """
    try:
        # Reuse an existing index instead of re-processing the video.
        existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id)
        if existing_index:
            logger.info("Video %s already processed. Using existing index.", video_id)
            return existing_index

        # Fetch the transcript together with its metadata.
        transcript_data = get_transcript(video_id)
        if not transcript_data:
            logger.error("Failed to retrieve transcript for video %s", video_id)
            return None

        processed_data = data_processor.process_transcript(video_id, transcript_data)
        if not processed_data:
            logger.error("Failed to process transcript for video %s", video_id)
            return None

        metadata = transcript_data['metadata']
        video_data = {
            'video_id': video_id,
            'title': metadata.get('title', 'Unknown Title'),
            'author': metadata.get('author', 'Unknown Author'),
            'upload_date': metadata.get('upload_date', 'Unknown Date'),
            # Counters can be missing, None or non-numeric; treat those as 0
            # rather than failing the whole video on an optional statistic.
            'view_count': _metadata_count(metadata.get('view_count', 0)),
            'like_count': _metadata_count(metadata.get('like_count', 0)),
            'comment_count': _metadata_count(metadata.get('comment_count', 0)),
            'video_duration': metadata.get('duration', 'Unknown Duration'),
            'transcript_content': processed_data['content'],
        }

        # Persist the video before building its index.
        db_handler.add_video(video_data)

        # build_index returns the name actually used, or a falsy value on failure.
        requested_name = f"video_{video_id}_{embedding_model}".lower()
        index_name = data_processor.build_index(requested_name)
        if not index_name:
            logger.error("Failed to process video %s", video_id)
            return None

        # Record which embedding model and index belong to this video.
        embedding_model_id = db_handler.add_embedding_model(
            embedding_model, "Description of the model"
        )
        video_record = db_handler.get_video_by_youtube_id(video_id)
        if not video_record:
            # The index was built but the video row could not be read back.
            logger.error("Failed to process video %s", video_id)
            return None

        # NOTE(review): video_record[0] is presumably the row's primary key - confirm.
        db_handler.add_elasticsearch_index(video_record[0], index_name, embedding_model_id)
        logger.info("Successfully processed video: %s", video_data['title'])
        return index_name
    except Exception as e:
        # Top-level boundary: report failure via the None return, but keep the
        # traceback in the log so the root cause stays diagnosable.
        logger.exception("Error processing video %s: %s", video_id, e)
        return None