Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -21,10 +21,18 @@ def link_find(url):
|
|
21 |
|
22 |
q=("a","p","span","content","article")
|
23 |
for p in soup.find_all("a"):
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
#out.append({"URL":p.get('href'),"TITLE":p.get('title'),"STRING":p.string,"TEXT":"","TREE":[]})
|
30 |
|
@@ -36,18 +44,19 @@ def link_find(url):
|
|
36 |
|
37 |
def sitemap(url,level):
|
38 |
uri=""
|
|
|
39 |
if url != "" and url != None:
|
40 |
link1,link2=link_find(url)
|
41 |
if level >=2:
|
42 |
for i,ea in enumerate(link1['TREE']):
|
43 |
print(ea)
|
44 |
try:
|
45 |
-
if not ea['URL'].startswith("http"):
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
out_list1,out_list2=link_find(f"{uri}{ea['URL']}")
|
52 |
link1['TREE'][i]=out_list1
|
53 |
link2['TREE'][i]=out_list2
|
@@ -57,12 +66,12 @@ def sitemap(url,level):
|
|
57 |
for n,na in enumerate(link1['TREE'][i]['TREE']):
|
58 |
print(na)
|
59 |
try:
|
60 |
-
if not na['URL'].startswith("http"):
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
out_list1,out_list2=link_find(f"{uri0}{na['URL']}")
|
67 |
link1['TREE'][i]['TREE'][n]=out_list1
|
68 |
link2['TREE'][i]['TREE'][n]=out_list2
|
|
|
21 |
|
22 |
q=("a","p","span","content","article")
|
23 |
for p in soup.find_all("a"):
|
24 |
+
url0=p.get('href')
|
25 |
+
if not url0.startswith("http"):
|
26 |
+
uri1=url0.split("//")[0]
|
27 |
+
uri2=url0.split("//")[1]
|
28 |
+
uri3=uri2.split("/")[0]
|
29 |
+
uri=f'{uri1}//{uri3}'
|
30 |
+
print(uri)
|
31 |
+
|
32 |
+
node1['LINKS'].append(uri)
|
33 |
+
node1['TREE'].append({"URL":uri,"TITLE":p.get('title'),"STRING":p.string,"TEXT":"","LINKS":[],"TREE":[]})
|
34 |
+
node2['TREE'].append({"URL":uri,"LINKS":[],"TREE":[]})
|
35 |
+
node2['LINKS'].append(uri)
|
36 |
|
37 |
#out.append({"URL":p.get('href'),"TITLE":p.get('title'),"STRING":p.string,"TEXT":"","TREE":[]})
|
38 |
|
|
|
44 |
|
45 |
def sitemap(url,level):
|
46 |
uri=""
|
47 |
+
uri0=""
|
48 |
if url != "" and url != None:
|
49 |
link1,link2=link_find(url)
|
50 |
if level >=2:
|
51 |
for i,ea in enumerate(link1['TREE']):
|
52 |
print(ea)
|
53 |
try:
|
54 |
+
#if not ea['URL'].startswith("http"):
|
55 |
+
# uri1=url.split("//")[0]
|
56 |
+
# uri2=url.split("//")[1]
|
57 |
+
# uri3=uri2.split("/")[0]
|
58 |
+
# uri=f'{uri1}//{uri3}'
|
59 |
+
# print(uri)
|
60 |
out_list1,out_list2=link_find(f"{uri}{ea['URL']}")
|
61 |
link1['TREE'][i]=out_list1
|
62 |
link2['TREE'][i]=out_list2
|
|
|
66 |
for n,na in enumerate(link1['TREE'][i]['TREE']):
|
67 |
print(na)
|
68 |
try:
|
69 |
+
#if not na['URL'].startswith("http"):
|
70 |
+
# uri11=url.split("//")[0]
|
71 |
+
# uri22=url.split("//")[1]
|
72 |
+
# uri33=uri22.split("/")[0]
|
73 |
+
# uri0=f'{uri11}//{uri33}'
|
74 |
+
# print(uri0)
|
75 |
out_list1,out_list2=link_find(f"{uri0}{na['URL']}")
|
76 |
link1['TREE'][i]['TREE'][n]=out_list1
|
77 |
link2['TREE'][i]['TREE'][n]=out_list2
|