Spaces:
Build error
Build error
File size: 4,730 Bytes
ae4e988 437cf54 1b682b7 437cf54 573c6d4 75dab34 1b682b7 a57fdc7 827c354 437cf54 827c354 a57fdc7 e30ed28 437cf54 1b682b7 62f0b09 a57fdc7 c30642b 437cf54 f0e1870 b962252 6660108 a57fdc7 f0e1870 3b1a5cc f0e1870 a57fdc7 6c531ab f0e1870 3b1a5cc f0e1870 a57fdc7 f0e1870 a57fdc7 3b1a5cc ae4e988 3b44741 a57fdc7 5e66ae2 a57fdc7 ae4e988 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
import gradio as gr
import requests
import bs4
def link_find(url):
    """Fetch *url* and scrape its anchors into two parallel tree nodes.

    Returns a ``(node1, node2)`` pair:
      * node1: {"URL", "TITLE", "STRING", "TEXT", "TREE"} — full node with the
        page's raw text and one child dict per ``<a>`` tag found.
      * node2: {"URL", "TREE"} — URL-only mirror of the same tree.

    On a non-200 response the empty default nodes are returned (the original
    raised UnboundLocalError here because node1/node2 were only assigned
    inside the 200 branch).
    """
    # Defaults first, so every exit path returns well-formed structures.
    node1 = {"URL": url, "TITLE": None, "STRING": None, "TEXT": "", "TREE": []}
    node2 = {"URL": url, "TREE": []}
    source = requests.get(url)
    if source.status_code == 200:
        print("YES")
        soup = bs4.BeautifulSoup(source.content, 'html.parser')
        node1["TITLE"] = soup.title
        # NOTE(review): soup.description resolves to the first <description>
        # TAG, which plain HTML pages rarely have (usually None); a
        # <meta name="description"> lookup may have been intended — confirm
        # before changing, as callers may rely on the current value.
        node1["STRING"] = soup.description
        node1["TEXT"] = soup.text
        # One child node per anchor; href/title may be None when absent.
        for p in soup.find_all("a"):
            node1["TREE"].append({"URL": p.get('href'), "TITLE": p.get('title'), "STRING": p.string, "TEXT": "", "TREE": []})
            node2["TREE"].append({"URL": p.get('href'), "TREE": []})
    else:
        print("NO")
    return node1, node2
#https://huggingface.co/spaces/Omnibus/crawl
def _base_uri(url):
    """Return ``scheme//host`` for an absolute *url*, used to resolve
    site-relative hrefs (e.g. ``/page``) back to fetchable URLs."""
    scheme = url.split("//")[0]
    host = url.split("//")[1].split("/")[0]
    return f'{scheme}//{host}'


def sitemap(url, level):
    """Crawl *url* up to *level* levels deep via link_find().

    Only relative hrefs (those not starting with "http") are followed, so
    the crawl stays on the starting site.  Returns the same ``(link1, link2)``
    pair shape as link_find(): full tree and URL-only tree.

    Fixed: the original returned link1/link2 without ever assigning them
    when *url* was "" or None, raising UnboundLocalError.
    """
    link1 = {"URL": url, "TREE": []}
    link2 = {"URL": url, "TREE": []}
    if url != "" and url is not None:
        link1, link2 = link_find(url)
        if level >= 2:
            for i, ea in enumerate(link1['TREE']):
                print(ea)
                try:
                    # ea['URL'] may be None (anchor without href); the
                    # AttributeError is caught below, as in the original.
                    if not ea['URL'].startswith("http"):
                        uri = _base_uri(url)
                        print(uri)
                        out_list1, out_list2 = link_find(f"{uri}{ea['URL']}")
                        link1['TREE'][i] = out_list1
                        link2['TREE'][i] = out_list2
                        if level >= 3:
                            for n, na in enumerate(out_list1['TREE']):
                                print(na)
                                try:
                                    if not na['URL'].startswith("http"):
                                        uri0 = _base_uri(url)
                                        print(uri0)
                                        sub1, sub2 = link_find(f"{uri0}{na['URL']}")
                                        link1['TREE'][i]['TREE'][n] = sub1
                                        link2['TREE'][i]['TREE'][n] = sub2
                                except Exception as e:
                                    print(e)
                except Exception as e:
                    print(e)
    return link1, link2
def sitemap_OG(url, level):
    """Legacy single-tree crawler kept alongside sitemap().

    Returns one link tree (link_find()'s node1 shape).

    Fixed: link_find() returns a ``(node1, node2)`` pair, but this function
    stored the whole tuple in ``link1`` and iterated the tuple itself, so the
    crawl never walked the link TREE and grafted tuples into child nodes.
    It now unpacks the pair and walks ``link1['TREE']`` like sitemap().
    Also fixed the UnboundLocalError on empty/None *url*.
    """
    link1 = {"URL": url, "TREE": []}
    if url != "" and url is not None:
        link1, _ = link_find(url)
        if level >= 2:
            for i, ea in enumerate(link1['TREE']):
                print(ea)
                try:
                    # Follow only relative (same-site) links.
                    if not ea['URL'].startswith("http"):
                        uri1 = url.split("//")[0]
                        uri3 = url.split("//")[1].split("/")[0]
                        uri = f'{uri1}//{uri3}'
                        print(uri)
                        out_list, _ = link_find(f"{uri}{ea['URL']}")
                        link1['TREE'][i] = out_list
                        if level >= 3:
                            for n, na in enumerate(link1['TREE'][i]['TREE']):
                                print(na)
                                try:
                                    if not na['URL'].startswith("http"):
                                        uri11 = url.split("//")[0]
                                        uri33 = url.split("//")[1].split("/")[0]
                                        uri0 = f'{uri11}//{uri33}'
                                        print(uri0)
                                        out_list1, _ = link_find(f"{uri0}{na['URL']}")
                                        link1['TREE'][i]['TREE'][n] = out_list1
                                except Exception as e:
                                    print(e)
                except Exception as e:
                    print(e)
    return link1
# Gradio UI: a URL box and a crawl-depth slider drive sitemap(); the wide
# JSON pane shows the full tree (page text included) and the narrow pane
# shows the URL-only mirror tree.
with gr.Blocks() as app:
    with gr.Row():
        with gr.Column(scale=3):
            with gr.Row():
                inp = gr.Textbox(label="URL")
                # Fixed: maximum was 2, which made the level-3 crawl branch
                # in sitemap() unreachable from the UI.
                level = gr.Slider(label="Depth", minimum=1, maximum=3, step=1, value=1)
            btn = gr.Button()
            outp = gr.JSON()
        with gr.Column(scale=1):
            outmap = gr.JSON()
    btn.click(sitemap, [inp, level], [outp, outmap])
app.launch()