In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
url = "https://lightning.ai/docs/pytorch/latest/starter/introduction.html"

response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    div_content = soup.find('div', class_='rst-content')

    if div_content:
        sections = div_content.find_all('section')

        for section in sections[1:]:

            section_content = section.get_text()
            print(section_content)
            print('-------------------')
    else:
        print("Div element with class 'rst-content' not found. Check the HTML structure of the page.")

else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")





1: Install PyTorch Lightning¶

For pip users
pip install lightning



For conda users
conda install lightning -c conda-forge



Or read the advanced install guide


-------------------

2: Define a LightningModule¶
A LightningModule enables your PyTorch nn.Module to play together in complex ways inside the training_step (there is also an optional validation_step and test_step).
import os
from torch import optim, nn, utils, Tensor
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
import lightning as L

# define any number of nn.Modules (or use your current ones)
encoder = nn.Sequential(nn.Linear(28 * 28, 64), nn.ReLU(), nn.Linear(64, 3))
decoder = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 28 * 28))


# define the LightningModule
class LitAutoEncoder(L.LightningModule):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def training_step(self, batch, batch

In [3]:
from urllib.parse import urlparse, urlunparse,urljoin

In [4]:
def remove_fragment(url):
    parser_url = urlparse(url)

    new_url = urlunparse(parser_url._replace(fragment=""))
    return new_url

In [5]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

crawled_urls = set()


# Function to fetch and extract links from a page
def get_links(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            links = []
            for a_tag in soup.find_all("a", href=True):
                link = a_tag["href"]
                links.append(link)
            return links
    except Exception as e:
        print(f"Failed to fetch links from {url}: {e}")
    return []


# Function to recursively fetch links within the same domain
def fetch_links_recursive(base_url, current_url, visited_urls, max_depth=4):
    if current_url in visited_urls or max_depth == 0:
        return

    visited_urls.add(current_url)
    links = get_links(current_url)
    for link in links:
        absolute_url = urljoin(current_url, link)
        parsed_url = urlparse(absolute_url)
        if parsed_url.netloc == base_url.netloc:
            crawled_urls.add(remove_fragment(absolute_url))
            fetch_links_recursive(base_url, absolute_url, visited_urls, max_depth - 1)

In [6]:
base_url = urlparse(
    "https://lightning.ai/docs/pytorch/latest/starter/introduction.html"
)
visited_urls = set()
fetch_links_recursive(base_url, base_url.geturl(), visited_urls)

In [7]:
len(crawled_urls)

275

In [8]:
crawled_urls

{'https://lightning.ai/docs/fabric/',
 'https://lightning.ai/docs/pytorch/latest/_images/custom_loop.png',
 'https://lightning.ai/docs/pytorch/latest/_images/ddp.gif',
 'https://lightning.ai/docs/pytorch/latest/_modules/lightning/fabric/utilities/throughput.html',
 'https://lightning.ai/docs/pytorch/latest/_modules/lightning/pytorch/core/module.html',
 'https://lightning.ai/docs/pytorch/latest/_modules/lightning/pytorch/trainer/trainer.html',
 'https://lightning.ai/docs/pytorch/latest/_sources/accelerators/gpu.rst.txt',
 'https://lightning.ai/docs/pytorch/latest/_sources/accelerators/tpu.rst.txt',
 'https://lightning.ai/docs/pytorch/latest/_sources/advanced/speed.rst.txt',
 'https://lightning.ai/docs/pytorch/latest/_sources/api_references.rst.txt',
 'https://lightning.ai/docs/pytorch/latest/_sources/common/checkpointing.rst.txt',
 'https://lightning.ai/docs/pytorch/latest/_sources/common/index.rst.txt',
 'https://lightning.ai/docs/pytorch/latest/_sources/common/lightning_module.rst.txt

In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


def extract_sections_to_csv(url, output_file):
    # Send an HTTP GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, "html.parser")

        # Find the div element with class "rst-content"
        div_content = soup.find("div", class_="rst-content")

        if div_content:
            # Find all section tags within the div_content
            sections = div_content.find_all("section")

            # Create a list to store the sections
            section_contents = []

            for section in sections[1:]:
                # Extract the content of each section
                section_content = section.get_text()
                section_contents.append(section_content)

            # Create a DataFrame with URL and Section Content columns
            df = pd.DataFrame(
                {
                    "URL": [url] * len(section_contents),
                    "Section Content": section_contents,
                }
            )

            # Save the DataFrame to a CSV file
            df.to_csv(output_file, index=False)

        else:
            print(
                "Div element with class 'rst-content' not found. Check the HTML structure of the page."
            )

    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")


# Example usage
url = "https://lightning.ai/docs/pytorch/latest"
output_file = "sections.csv"
extract_sections_to_csv(url, output_file)

In [10]:
from rich.progress import track

for i, url in enumerate(track(crawled_urls)):
    output_file = f"E:/Projects/Hackathons/StudyBot/crawled/{i}.csv"
    extract_sections_to_csv(url, output_file)

Output()

In [11]:
from langchain.embeddings import HuggingFaceEmbeddings
import pandas as pd
from glob import glob
import pinecone

  from tqdm.autonotebook import tqdm


In [12]:
pinecone.init(api_key="2c94d392-e7de-40d6-8dbc-b485fac62af2", environment="gcp-starter")

In [13]:
from chromadb.utils import embedding_functions


import pandas as pd
from glob import glob

In [14]:
import chromadb

chroma_client = chromadb.PersistentClient(path="db")

# collection = chroma_client.create_collection(name="test")
collection = chroma_client.get_collection(name="test")

In [15]:
csvs = glob("crawled/*.csv")

In [16]:
from rich.progress import track
from rich import print
from os.path import basename

In [17]:
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

False

The following directories listed in your path were found to be non-existent: {WindowsPath('/Anaconda3/envs/pytorchML/lib'), WindowsPath('D')}
The following directories listed in your path were found to be non-existent: {WindowsPath('vs/workbench/api/node/extensionHostProcess')}
The following directories listed in your path were found to be non-existent: {WindowsPath('module'), WindowsPath('/matplotlib_inline.backend_inline')}
The following directories listed in your path were found to be non-existent: {WindowsPath('/usr/local/cuda/lib64')}
DEBUG: Possible options found for libcudart.so: set()
CUDA SETUP: PyTorch settings found: CUDA_VERSION=117, Highest Compute Capability: 7.5.
CUDA SETUP: To manually override the PyTorch CUDA version please see:https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md
CUDA SETUP: Loading binary d:\Anaconda3\envs\pytorchML\lib\site-packages\bitsandbytes\libbitsandbytes_cuda117.so...
argument of type 'WindowsPath' is not


python -m bitsandbytes


  warn(msg)
  warn(msg)


RuntimeError: Failed to import transformers.models.bert.modeling_bert because of the following error (look up to see its traceback):

        CUDA Setup failed despite GPU being available. Please run the following command to get more information:

        python -m bitsandbytes

        Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
        to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
        and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues

In [None]:
for csv in track(csvs):
    