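# NOTE(review): the hosting page showed "Spaces: Build error" above this file;
# that banner appears to be page residue rather than program text.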
from urllib.parse import urljoin

import bs4
import gradio as gr
import requests
def link_find(url, timeout=10):
    """Fetch *url* and return one record per ``<a>`` tag found in the page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so an unresponsive host cannot stall the crawl forever.

    Returns:
        A list of dicts with the keys ``"LINK TITLE"`` (the tag's ``title``
        attribute), ``"URL"`` (its ``href`` attribute), ``"STRING"`` (the
        tag's ``.string`` value) and ``"TREE"`` (a fresh empty list for the
        caller to fill in). Attributes the tag does not carry are ``None``.
        The list is empty when the response status is not 200.

    Raises:
        requests.RequestException: Propagated unchanged from
            ``requests.get`` (invalid URL, connection failure, timeout).
    """
    source = requests.get(url, timeout=timeout)
    if source.status_code != 200:
        # Best effort: a page that cannot be fetched contributes no links.
        return []
    # 'html.parser' ships with Python, so no extra parser package is needed.
    soup = bs4.BeautifulSoup(source.content, 'html.parser')
    return [
        {
            "LINK TITLE": anchor.get('title'),
            "URL": anchor.get('href'),
            "STRING": anchor.string,
            "TREE": [],
        }
        for anchor in soup.find_all("a")
    ]
# https://huggingface.co/spaces/Omnibus/crawl
def sitemap(url):
    """Build a two-level link map for the page at *url*.

    The page is scanned with ``link_find``; every link found there is then
    fetched once more and the links on that child page are stored under the
    parent record's ``"TREE"`` key.

    Args:
        url: Address of the start page. ``None`` or an empty string yields
            an empty result without any network access.

    Returns:
        The list of link records returned by ``link_find(url)``, each with
        its ``"TREE"`` entry set to the list of link records found on the
        linked page. ``"TREE"`` stays an empty list for links that have no
        ``href``, do not resolve to an http(s) address, or fail to download.
    """
    if not url:
        return []
    links = link_find(url)
    for entry in links:
        print(entry)
        href = entry['URL']
        if not href:
            # <a> tags without an href have nothing to follow.
            continue
        # urljoin handles absolute, root-relative and relative hrefs alike;
        # plain string concatenation mangles every form except "/path".
        child_url = urljoin(url, href)
        if not child_url.startswith(("http://", "https://")):
            # mailto:, javascript:, tel: ... cannot be fetched over HTTP.
            continue
        try:
            # Assign the child list directly: list.append() returns None,
            # so storing its result would wipe the entry.
            entry['TREE'] = link_find(child_url)
        except requests.RequestException as err:
            # One unreachable link must not abort the whole crawl.
            print(f"Skipping {child_url}: {err}")
    return links
# Gradio front end: a text box for the start URL, a button that triggers the
# crawl, and a JSON viewer for the result.
with gr.Blocks() as app:
    url_box = gr.Textbox()
    crawl_button = gr.Button()
    result_view = gr.JSON()
    # On click, pass the text box value to sitemap() and display its return
    # value in the JSON viewer.
    crawl_button.click(fn=sitemap, inputs=url_box, outputs=result_view)

# Start the web server for the interface defined above.
app.launch()