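"""Scrape course listings from Analytics Vidhya and save them to a JSON file.

Walks the paginated course collection, extracts each course's title, lesson
count (used as the description), thumbnail image, and link, and writes the
results to ../data/courses.json relative to this script.
"""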
import requests
from bs4 import BeautifulSoup
import json
import os

BASE_URL = "https://courses.analyticsvidhya.com/collections/courses?page="
OUTPUT_FILE = os.path.join(os.path.dirname(__file__), "../data/courses.json")


def scrape_courses():
    courses = []
    # Scrape pages 1-9 of the course catalogue
    for page in range(1, 10):
        URL = f"{BASE_URL}{page}"
        print(f"Scraping URL: {URL}")
        response = requests.get(URL, timeout=30)
        print(f"Response status: {response.status_code}")

        # Skip the page if the request was not successful
        if response.status_code != 200:
            print(f"Failed to fetch the webpage. Status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")

        # Locate course containers
        course_items = soup.find_all("li", class_="products__list-item")
        print(f"Found {len(course_items)} course containers on page {page}.")

        # Loop through each course container to extract details
        for item in course_items:
            # Extract the course link and make it absolute if needed
            link_tag = item.find("a", class_="course-card")
            course_link = link_tag.get("href", "#") if link_tag else "#"
            if not course_link.startswith("http"):
                course_link = f"https://courses.analyticsvidhya.com{course_link}"

            # Extract the course title
            title_tag = link_tag.find("h3") if link_tag else None
            title = title_tag.text.strip() if title_tag else "No Title"

            # Extract the course image
            image_tag = link_tag.find("img", class_="course-card__img") if link_tag else None
            image_url = image_tag.get("src", "No Image URL") if image_tag else "No Image URL"

            # Extract the lesson count, used here as the course description
            lesson_tag = link_tag.find("span", class_="course-card__lesson-count") if link_tag else None
            description = lesson_tag.text.strip() if lesson_tag else "No Description"

            # Add the extracted details to the list
            courses.append({
                "title": title,
                "description": description,
                "image_url": image_url,
                "course_link": course_link,
            })

    # Debugging: print the first few courses
    print(f"Scraped {len(courses)} courses.")
    for course in courses[:3]:
        print(course)

    # Ensure the directory for the output file exists
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

    # Save the course data to a JSON file
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(courses, f, indent=4)
    print(f"Data saved to {os.path.abspath(OUTPUT_FILE)}")


if __name__ == "__main__":
    scrape_courses()