Dede16 committed on
Commit ce05afa · 1 Parent(s): 97df3ca

Update app.py

Files changed (1)
  1. app.py +180 -91
app.py CHANGED
@@ -96,6 +96,42 @@ def get_azure_response(messages, api_key, azure_api_base):
  if finish_reason == 'length' or finish_reason == 'stop':
  return response['choices'][0]['message']['content']

+ def cek_url(url):
+ if not os.path.exists("log_url.txt"):
+ with open("log_url.txt", 'w') as file:
+ pass
+
+ with open("log_url.txt", 'r') as file:
+ scraped_urls = set(url.strip() for url in file.readlines())
+
+ if url in scraped_urls:
+ return True
+ else:
+ scraped_urls.add(url)
+ return False
+
+ def scrap_portal(query):
+ api_key = 'AIzaSyDJUWVZG2oHkHSsYoqdqgUZwQC2Aa2kSok'
+ search_engine_id = 'a0dc878459ceb4811'
+ num_pages = 5
+ link = []
+
+ for page in range(num_pages):
+ start_index = page * 10 + 1
+ url = f'https://www.googleapis.com/customsearch/v1?key={api_key}&cx={search_engine_id}&q={query}&start={start_index}'
+ response = requests.get(url)
+ if response.status_code == 200:
+ data = response.json()
+
+ for item in data['items']:
+ url = item['link']
+ link.append(url)
+ else:
+ print(f"Permintaan halaman {page + 1} gagal. Kode status:", response.status_code)
+ filter_link1 = [url for url in link if "categories" not in url and "tags" not in url]
+ filter_link2 = [url for url in filter_link1 if "help" not in url]
+ return filter_link2
+
  def clean_scrap(artikel,link,models,api_key,azure_api_base):
  new_artikel = []
  article = []
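
Note: a minimal sketch of how the two helpers added above are meant to work together (illustrative only; the topic string and variable names below are hypothetical, not part of this commit):

  # Hypothetical usage of the new helpers, assuming requests/os are already imported in app.py
  candidate_links = scrap_portal("wisata bali")                   # Google Custom Search results for a topic
  fresh_links = [u for u in candidate_links if not cek_url(u)]    # keep only URLs not yet recorded in log_url.txt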
@@ -170,7 +206,7 @@ def clean_scrap(artikel,link,models,api_key,azure_api_base):

  return title, judul, link, contents

- def scrap_artikel(link_scrap,models,api_key,azure_api_base):
+ def scrap_artikel(source_type,source,models,api_key,azure_api_base):
  options = webdriver.ChromeOptions()
  options.add_argument('--headless')
  options.add_argument('--no-sandbox')
@@ -178,35 +214,69 @@ def scrap_artikel(link_scrap,models,api_key,azure_api_base):

  user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
  options.add_argument(f"user-agent={user_agent}")
-
  wd = webdriver.Chrome(options=options)
- wd.get(link_scrap)
-
- wd.find_element(By.CSS_SELECTOR, 'body').send_keys(Keys.CONTROL, Keys.END)
- time.sleep(1)
-
- raw_html = wd.find_element(By.TAG_NAME, 'body').get_attribute('innerHTML')
- wd.quit()
-
- soup_html = BeautifulSoup(raw_html, "html.parser")
- containers = soup_html.findAll('p')

- artikel =[]
- for paragraph in containers:
- artic=paragraph.get_text()
- artikel.append(artic)
-
- paragraf = ' '.join(artikel)
- if len(paragraf)>= 18000:
- part1, part2, part3, part4 = split_article(paragraf)
- artikels = [part1, part2, part3, part4]
- else :
- artikels = [paragraf]
- title, judul, url, contents = clean_scrap(artikels,link_scrap,models,api_key,azure_api_base)
- return title, judul, url, contents
-
- def artikel_processing(link_scrap,backlink,keyword,models,api_key,azure_api_base,replicate_key):
- title, judul, url, artikel= scrap_artikel(link_scrap, models, api_key,azure_api_base)
+
+ if source_type == "input topic":
+ link = scrap_portal(source)
+ for url in link:
+ if cek_url(url):
+ continue
+ else:
+ if len(artikel) >=1:
+ continue
+ wd.get(url)
+ wd.find_element(By.CSS_SELECTOR, 'body').send_keys(Keys.CONTROL, Keys.END)
+ time.sleep(1)
+
+ raw_html = wd.find_element(By.TAG_NAME, 'body').get_attribute('innerHTML')
+ wd.quit()
+
+ soup_html = BeautifulSoup(raw_html, "html.parser")
+ containers = soup_html.findAll('p')
+
+ artikel =[]
+ for paragraph in containers:
+ artic=paragraph.get_text()
+ artikel.append(artic)
+
+ paragraf = ' '.join(artikel)
+ if len(paragraf)>= 18000:
+ part1, part2, part3, part4 = split_article(paragraf)
+ artikels = [part1, part2, part3, part4]
+ else :
+ artikels = [paragraf]
+ title, judul, url, contents = clean_scrap(artikels,url,models,api_key,azure_api_base)
+ return title, judul, url, contents
+
+ else:
+ wd.get(source)
+
+ wd.find_element(By.CSS_SELECTOR, 'body').send_keys(Keys.CONTROL, Keys.END)
+ time.sleep(1)
+
+ raw_html = wd.find_element(By.TAG_NAME, 'body').get_attribute('innerHTML')
+ wd.quit()
+
+ soup_html = BeautifulSoup(raw_html, "html.parser")
+ containers = soup_html.findAll('p')
+
+ artikel =[]
+ for paragraph in containers:
+ artic=paragraph.get_text()
+ artikel.append(artic)
+
+ paragraf = ' '.join(artikel)
+ if len(paragraf)>= 18000:
+ part1, part2, part3, part4 = split_article(paragraf)
+ artikels = [part1, part2, part3, part4]
+ else :
+ artikels = [paragraf]
+ title, judul, url, contents = clean_scrap(artikels,source,models,api_key,azure_api_base)
+ return title, judul, url, contents
+
+ def artikel_processing(source_type,source,backlink,keyword,models,api_key,azure_api_base,replicate_key):
+ title, judul, url, artikel= scrap_artikel(source_type,source, models, api_key,azure_api_base)
  teks_to_tags = artikel[0][:500]
  translated = []
  optimized = []
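
Note: scrap_artikel now branches on source_type — "input topic" searches Google Custom Search via scrap_portal and appears intended to scrape the first URL not already logged, while any other value treats source as a direct article URL. A sketch of the two call shapes (argument values here are hypothetical):

  # Hypothetical calls matching the new signature
  title, judul, url, contents = scrap_artikel("input topic", "berita teknologi", "openai", api_key, None)
  title, judul, url, contents = scrap_artikel("input link", "https://example.com/artikel", "openai", api_key, None)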
@@ -447,69 +517,87 @@ def artikel_processing(link_scrap,backlink,keyword,models,api_key,azure_api_base
  base64_string = base64.b64decode(base64_string)
  image_data= base64_string
  os.remove(tmp_path)
- return judul,content,image,image_data
+ return judul,content,image,image_data,url
+
+ def scrap(source_type,source,backlink,keyword,version,api_key,azure_api_base,replicate_key):
+ # try:
+ judul,kontent,gambar, image_data,url= artikel_processing(source_type,source,backlink,keyword,version,api_key,azure_api_base,replicate_key)
+ title = '<h1>'+judul+'</h1>'
+ desired_timezone = pytz.timezone('Asia/Jakarta')
+ current_time = datetime.datetime.now(desired_timezone)
+ Timestamp = current_time.strftime('%Y-%m-%d %H:%M:%S')
+
+ with open('log_activity.txt', 'r') as file:
+ existing_data = file.read()
+
+ log = 'Source:' + url + '\nGenerated_Title:' + judul + '\nTimestamp:' + Timestamp + "\n\n"
+
+ combined_data = existing_data + log
+
+ with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file:
+ temp_file.write(combined_data)

- def scrap(link_scrap,backlink,keyword,version,api_key,azure_api_base,replicate_key):
- try:
- judul,kontent,gambar, image_data= artikel_processing(link_scrap,backlink,keyword,version,api_key,azure_api_base,replicate_key)
- title = '<h1>'+judul+'</h1>'
- desired_timezone = pytz.timezone('Asia/Jakarta')
- current_time = datetime.datetime.now(desired_timezone)
- Timestamp = current_time.strftime('%Y-%m-%d %H:%M:%S')
-
- with open('log_activity.txt', 'r') as file:
- existing_data = file.read()
-
- log = 'Generated_Title:' + judul + '\nTimestamp:' + Timestamp + '\nLink:' + link_scrap + "\n\n"
-
- combined_data = existing_data + log
-
- with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file:
- temp_file.write(combined_data)
-
- with open("judul.txt", "w") as file:
- file.write(judul)
-
- with open("kontent.txt", "w") as file:
- file.write(kontent)
+ with open('log_url.txt', 'r') as file:
+ existing_data = file.read()
+
+ log = 'Source:' + url + "\n"
+ combined_data = existing_data + log
+
+ with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file:
+ temp_file.write(combined_data)
+
+ with open("judul.txt", "w") as file:
+ file.write(judul)
+
+ with open("kontent.txt", "w") as file:
+ file.write(kontent)
+
+ repo_name = get_full_repo_name(model_id="Article_Gen3", token="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")
+ file_url = upload_file(
+ path_or_fileobj=temp_file.name, # Use the temporary file name
+ path_in_repo="log_activity.txt",
+ repo_id=repo_name,
+ repo_type="space",
+ token="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",
+ )
+ file_url = upload_file(
+ path_or_fileobj=image_data,
+ path_in_repo="image_data.txt",
+ repo_id=repo_name,
+ repo_type="space",
+ token="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",
+ )
+ file_url = upload_file(
+ path_or_fileobj='judul.txt',
+ path_in_repo="judul.txt",
+ repo_id=repo_name,
+ repo_type="space",
+ token="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",
+ )
+ file_url = upload_file(
+ path_or_fileobj='kontent.txt',
+ path_in_repo="kontent.txt",
+ repo_id=repo_name,
+ repo_type="space",
+ token="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",
+ )
+ if kontent:
+ status = "<h3>Berhasil Generate Artikel</h3>"
+ time.sleep(60)
+ return status,gambar
+ # except:
+ # with open('log_url.txt', 'r') as file:
+ # existing_data = file.read()
+
+ # log = 'Source:' + source + "\n"
+ # combined_data = existing_data + log
+
+ # with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file:
+ # temp_file.write(combined_data)

- repo_name = get_full_repo_name(model_id="Article_Gen3", token="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")
- file_url = upload_file(
- path_or_fileobj=temp_file.name, # Use the temporary file name
- path_in_repo="log_activity.txt",
- repo_id=repo_name,
- repo_type="space",
- token="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",
- )
- file_url = upload_file(
- path_or_fileobj=image_data,
- path_in_repo="image_data.txt",
- repo_id=repo_name,
- repo_type="space",
- token="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",
- )
- file_url = upload_file(
- path_or_fileobj='judul.txt',
- path_in_repo="judul.txt",
- repo_id=repo_name,
- repo_type="space",
- token="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",
- )
- file_url = upload_file(
- path_or_fileobj='kontent.txt',
- path_in_repo="kontent.txt",
- repo_id=repo_name,
- repo_type="space",
- token="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",
- )
- if kontent:
- status = "<h3>Berhasil Generate Artikel</h3>"
- time.sleep(60)
- return status,gambar
- except:
- status = "<h3>Gagal Generate Artikel</h3>"
- gambar = Image.open('error.png')
- return status,gamber
+ # status = "<h3>Gagal Generate Artikel</h3>"
+ # gambar = Image.open('error.png')
+ # return status,gamber


  def post(endpoint,endpoint_media,username,password,tags,categories,metode):
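
Note: scrap() now records the scraped source URL as well; each successful run appends a block of roughly this shape to log_activity.txt and a matching 'Source:' line to log_url.txt (values below are illustrative):

  Source:https://example.com/artikel
  Generated_Title:Judul Artikel Hasil Generate
  Timestamp:2024-01-01 12:00:00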
@@ -614,7 +702,8 @@ with gr.Blocks(theme = "soft", title="Wordpress Article Generator") as article_g
  """)
  with gr.Row():
  with gr.Column():
- link = gr.Textbox(placeholder="Masukkan Link Artikel Yang Akan di Scrap", label="Link")
+ source_type = gr.Radio(["input link", "input topic"], label="Source")
+ source = gr.Textbox(placeholder="Masukkan Source Link/Topik Artikel Yang Akan Digenerate", show_label=False)
  backlink = gr.Textbox(placeholder="Masukkan Backlink Yang Akan Diterapkan", label="Backlink")
  keyword = gr.Textbox(placeholder="Masukkan Keyword Artikel", label="Keyword")
  versi = gr.Radio(["openai", "azure"], label="Request Schema", info="Pilih Skema Untuk Request ke ChatGPT ")
@@ -624,7 +713,7 @@ with gr.Blocks(theme = "soft", title="Wordpress Article Generator") as article_g
  button_scrap = gr.Button("Scrap Article")
  output = gr.HTML("")
  img = gr.Image(label="Content Media")
- button_scrap.click(fn=scrap, inputs=[link,backlink,keyword,versi,api_key,link_azure,replicate_token], outputs= [output,img])
+ button_scrap.click(fn=scrap, inputs=[source_type,source,backlink,keyword,versi,api_key,link_azure,replicate_token], outputs= [output,img])
  view_outputs = gr.Button("View Article")
  with gr.Tab("Raw Article"):
  title = gr.Textbox("", label="Title", interactive=True)
 