Rivalcoder committed on
Commit
cbf58a5
·
1 Parent(s): 033ac17
Files changed (3) hide show
  1. Dockerfile +19 -0
  2. app.py +44 -0
  3. requirements.txt +3 -0
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# Gradio binds to 127.0.0.1 unless told otherwise, which is unreachable from
# outside the container; listen on all interfaces and document the default port.
ENV GRADIO_SERVER_NAME=0.0.0.0
EXPOSE 7860

# Install Chrome
# The signing key is stored in a dedicated keyring and referenced via
# "signed-by" because apt-key is deprecated and absent from newer Debian images.
# The apt lists are removed at the end to keep the layer small.
RUN apt-get update && apt-get install -y wget gnupg unzip curl \
    && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub \
        | gpg --dearmor -o /usr/share/keyrings/google-chrome.gpg \
    && echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" \
        > /etc/apt/sources.list.d/google-chrome.list \
    && apt-get update \
    && apt-get install -y google-chrome-stable \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies
# requirements.txt is copied before the app code so this layer is reused
# when only app.py changes.
COPY requirements.txt /app/requirements.txt
WORKDIR /app
RUN pip install --no-cache-dir -r requirements.txt

# Copy app code
COPY app.py /app/app.py

# Run the app
CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import gradio as gr
3
+ from selenium.webdriver.common.by import By
4
+ import undetected_chromedriver as uc
5
+
6
def get_captions_selenium(video_url):
    """Open a page in headless Chrome and report whether caption metadata is present.

    Args:
        video_url: URL of the page to load in the browser.

    Returns:
        A human-readable status string: a success message when the
        ``captionTracks`` marker occurs in the page source, a warning when it
        does not, or an error message carrying the exception text when any
        step fails. Exceptions are reported through the return value rather
        than raised.
    """
    driver = None
    try:
        # Launch browser
        options = uc.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        driver = uc.Chrome(options=options)

        driver.get(video_url)
        # Fixed pause so the page has time to finish loading before the
        # source is inspected.
        time.sleep(5)

        # Only the presence of the marker is checked; the caption data itself
        # is not extracted or parsed here.
        if "captionTracks" in driver.page_source:
            return "✅ Found potential captions info in page source (you may need to parse this JSON)."
        return "⚠️ Captions info not found in source. May not be available or blocked."

    except Exception as e:
        return f"❌ Error: {str(e)}"

    finally:
        # Always shut the browser down, including when navigation or the page
        # read failed, so Chrome processes are not left behind.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # Best-effort cleanup: a failure while quitting must not
                # replace the status message already chosen above.
                pass
36
+
37
# Gradio interface: a single text box for the video URL, with the scraper's
# status message shown as plain text.
demo = gr.Interface(
    fn=get_captions_selenium,
    inputs=[gr.Textbox(label="YouTube Video URL")],
    outputs="text",
    title="YouTube Captions Scraper (Selenium)",
    description="Extract captions using headless browser via Selenium.",
)

# Start the web server for the interface.
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio
2
+ selenium
3
+ undetected-chromedriver