Spaces:
Runtime error
Runtime error
Rivalcoder
committed on
Commit
·
cbf58a5
1
Parent(s):
033ac17
Add
Browse files- Dockerfile +19 -0
- app.py +44 -0
- requirements.txt +3 -0
Dockerfile
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Slim Debian-based image with Python 3.10.
FROM python:3.10-slim

# Install Google Chrome from Google's apt repository.
# The signing key is stored in a dedicated keyring and referenced with
# `signed-by` because `apt-key` is deprecated and missing from newer Debian
# releases. The apt package lists are removed afterwards to keep the layer small.
RUN apt-get update && apt-get install -y wget gnupg unzip curl ca-certificates \
    && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub \
        | gpg --dearmor -o /usr/share/keyrings/google-chrome.gpg \
    && echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] https://dl.google.com/linux/chrome/deb/ stable main" \
        > /etc/apt/sources.list.d/google-chrome.list \
    && apt-get update \
    && apt-get install -y google-chrome-stable \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies first so this layer is cached when only app.py changes.
WORKDIR /app
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy app code
COPY app.py /app/app.py

# Gradio binds to 127.0.0.1 by default, which is unreachable from outside the
# container; listen on all interfaces on the port the platform routes to.
ENV GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860
EXPOSE 7860

# Run the app
CMD ["python", "app.py"]
|
app.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
import gradio as gr
|
3 |
+
from selenium.webdriver.common.by import By
|
4 |
+
import undetected_chromedriver as uc
|
5 |
+
|
6 |
+
def get_captions_selenium(video_url):
    """Report whether a video page's HTML mentions caption-track metadata.

    Opens ``video_url`` in a headless Chrome session, waits a fixed five
    seconds for the page to load, and checks the rendered page source for the
    ``captionTracks`` marker. The caption data itself is not parsed.

    Args:
        video_url: URL of the video page to load.

    Returns:
        A human-readable status string: a success message when the marker is
        present, a warning when it is absent, or an ``"❌ Error: ..."``
        message when the URL is blank or the browser session fails. Exceptions
        are never propagated to the caller.
    """
    # Reject blank input up front instead of paying for a browser launch.
    if not video_url or not video_url.strip():
        return "❌ Error: Please provide a YouTube video URL."

    try:
        # Launch browser
        options = uc.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        driver = uc.Chrome(options=options)
        try:
            driver.get(video_url)
            # Fixed wait for the page scripts to populate the player data.
            time.sleep(5)
            page_source = driver.page_source
        finally:
            # Always shut the browser down, even when navigation fails, so
            # Chrome processes do not accumulate across requests.
            driver.quit()
    except Exception as e:
        # UI boundary: surface any failure as text rather than a traceback.
        return f"❌ Error: {e}"

    # YouTube UI changes often; this only detects the marker (used for
    # auto-generated captions too) and may need tuning.
    if "captionTracks" in page_source:
        return "✅ Found potential captions info in page source (you may need to parse this JSON)."
    return "⚠️ Captions info not found in source. May not be available or blocked."
|
36 |
+
|
37 |
+
# Gradio interface
|
38 |
+
# Build the UI once at module level so it can be imported without starting
# a server.
demo = gr.Interface(
    fn=get_captions_selenium,
    inputs=[gr.Textbox(label="YouTube Video URL")],
    outputs="text",
    title="YouTube Captions Scraper (Selenium)",
    description="Extract captions using headless browser via Selenium.",
)

if __name__ == "__main__":
    # Inside a container the default bind address (127.0.0.1) is unreachable
    # from outside, so listen on all interfaces on the conventional port.
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
selenium
|
3 |
+
undetected-chromedriver
|