Rivalcoder committed on
Commit
a0bdeee
·
1 Parent(s): cbf58a5
Files changed (2) hide show
  1. Dockerfile +25 -9
  2. app.py +10 -9
Dockerfile CHANGED
@@ -1,19 +1,35 @@
1
  FROM python:3.10-slim
2
 
 
 
 
3
  # Install Chrome
4
- RUN apt-get update && apt-get install -y wget gnupg unzip curl \
5
- && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
6
- && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' \
7
- && apt-get update \
8
- && apt-get install -y google-chrome-stable
9
-
10
- # Install dependencies
11
- COPY requirements.txt /app/requirements.txt
 
 
 
 
 
 
 
 
 
 
12
  WORKDIR /app
 
 
 
13
  RUN pip install --no-cache-dir -r requirements.txt
14
 
15
  # Copy app code
16
- COPY app.py /app/app.py
17
 
18
  # Run the app
19
  CMD ["python", "app.py"]
 
1
  FROM python:3.10-slim
2
 
3
+ # Install Chrome and dependencies
4
+ RUN apt-get update && apt-get install -y wget gnupg unzip curl ca-certificates fonts-liberation libappindicator3-1 libasound2 libatk-bridge2.0-0 libnspr4 libnss3 libx11-xcb1 libxcomposite1 libxdamage1 libxrandr2 xdg-utils libu2f-udev libvulkan1
5
+
6
  # Install Chrome
7
+ RUN wget -q -O google-chrome.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
8
+ && apt install -y ./google-chrome.deb \
9
+ && rm google-chrome.deb
10
+
11
+ # Install matching ChromeDriver
12
+ RUN CHROME_VERSION=$(google-chrome --version | grep -oP '\d+\.\d+\.\d+') \
13
+ && CHROMEDRIVER_VERSION=$(curl -s "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_$CHROME_VERSION") \
14
+ && wget -O /tmp/chromedriver.zip https://chromedriver.storage.googleapis.com/${CHROMEDRIVER_VERSION}/chromedriver_linux64.zip \
15
+ && unzip /tmp/chromedriver.zip -d /usr/local/bin/ \
16
+ && chmod +x /usr/local/bin/chromedriver \
17
+ && rm /tmp/chromedriver.zip
18
+
19
+ # Set environment variables
20
+ ENV PATH="/usr/local/bin:$PATH"
21
+ ENV CHROME_BIN="/usr/bin/google-chrome"
22
+ ENV CHROMEDRIVER="/usr/local/bin/chromedriver"
23
+
24
+ # Set working dir
25
  WORKDIR /app
26
+
27
+ # Install Python dependencies
28
+ COPY requirements.txt .
29
  RUN pip install --no-cache-dir -r requirements.txt
30
 
31
  # Copy app code
32
+ COPY app.py .
33
 
34
  # Run the app
35
  CMD ["python", "app.py"]
app.py CHANGED
@@ -5,33 +5,34 @@ import undetected_chromedriver as uc
5
 
6
  def get_captions_selenium(video_url):
7
  try:
8
- # Launch browser
9
  options = uc.ChromeOptions()
10
- options.add_argument("--headless")
11
  options.add_argument("--no-sandbox")
12
  options.add_argument("--disable-dev-shm-usage")
13
 
14
  driver = uc.Chrome(options=options)
15
-
16
  driver.get(video_url)
17
- time.sleep(5)
18
 
19
- # Click "..." -> "Open transcript"
20
- # YouTube UI changes often; this is just an example. May need tuning.
21
 
22
- # Try to find subtitles in the page source (for auto-generated)
23
  page_source = driver.page_source
 
24
  if "captionTracks" in page_source:
25
  start = page_source.find("captionTracks")
26
  end = page_source.find("]", start) + 1
27
  caption_json = page_source[start:end]
28
  driver.quit()
29
- return "✅ Found potential captions info in page source (you may need to parse this JSON)."
30
  else:
31
  driver.quit()
32
  return "⚠️ Captions info not found in source. May not be available or blocked."
33
 
34
  except Exception as e:
 
35
  return f"❌ Error: {str(e)}"
36
 
37
  # Gradio interface
@@ -40,5 +41,5 @@ gr.Interface(
40
  inputs=[gr.Textbox(label="YouTube Video URL")],
41
  outputs="text",
42
  title="YouTube Captions Scraper (Selenium)",
43
- description="Extract captions using headless browser via Selenium."
44
  ).launch()
 
5
 
6
  def get_captions_selenium(video_url):
7
  try:
8
+ print("🚀 Launching Chrome...")
9
  options = uc.ChromeOptions()
10
+ options.add_argument("--headless=new") # Use 'new' headless mode for Chrome 109+
11
  options.add_argument("--no-sandbox")
12
  options.add_argument("--disable-dev-shm-usage")
13
 
14
  driver = uc.Chrome(options=options)
15
+ print("🌍 Navigating to video...")
16
  driver.get(video_url)
 
17
 
18
+ print("⌛ Waiting for page to load...")
19
+ time.sleep(5)
20
 
21
+ print("📄 Scraping page source...")
22
  page_source = driver.page_source
23
+
24
  if "captionTracks" in page_source:
25
  start = page_source.find("captionTracks")
26
  end = page_source.find("]", start) + 1
27
  caption_json = page_source[start:end]
28
  driver.quit()
29
+ return "✅ Found potential captions info in page source (you may need to parse this JSON).\n\n" + caption_json
30
  else:
31
  driver.quit()
32
  return "⚠️ Captions info not found in source. May not be available or blocked."
33
 
34
  except Exception as e:
35
+ print(f"❌ Exception occurred: {e}")
36
  return f"❌ Error: {str(e)}"
37
 
38
  # Gradio interface
 
41
  inputs=[gr.Textbox(label="YouTube Video URL")],
42
  outputs="text",
43
  title="YouTube Captions Scraper (Selenium)",
44
+ description="Uses Selenium with undetected-chromedriver to extract captions from a YouTube video."
45
  ).launch()