added techradar

Branch: master
madiwka3 committed 4 years ago
Parent commit: ccc905e351

1 changed file with 278 additions and 125 deletions:

GOOGLE.pyw  (+278, -125)
@@ -15,10 +15,20 @@ from threading import *
 import os
 from tkinter import messagebox
 desktop = expanduser("~/Documents")
+agency = "verge"
 def chooseDirectory():
     currdir = os.getcwd()
     tempdir = filedialog.askdirectory(parent=root, initialdir=currdir, title='Please select a directory')
     program.directory = tempdir
+def switchAgencies(agencies):
+    print("called Agencies")
+    if agencies == "verge":
+        print("switching to techradar")
+        agencies = "techradar"
+    else:
+        print("switching to verge")
+        agencies = "verge"
+    button4['text'] = agencies
 class Scrapers(object):
     def __init__(self):
         self.thread1 = None
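
The switchAgencies helper added in the hunk above only flips the label of button4 between "verge" and "techradar"; the actual source selection happens later, when start_now branches on button4['text']. A minimal, self-contained sketch of the same toggle pattern, using only names that appear in the diff (the standalone window is purely illustrative):

# Standalone sketch of the agency toggle wired to a Tkinter button.
# button4 and the "verge"/"techradar" labels come from the diff above;
# the separate Tk root here is only for demonstration.
from tkinter import Tk, Button

root = Tk()

def switch_agencies():
    # Flip the button label between the two supported news sources.
    button4['text'] = "techradar" if button4['text'] == "verge" else "verge"

button4 = Button(root, text="verge", command=switch_agencies)
button4.pack()
root.mainloop()
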
@@ -63,136 +73,277 @@ class Scrapers(object):
         self.thread1.start()
         threadActive = 1
     def start_now(self):
-        progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate')
-        progress['value'] = 0
-        progress.pack(side = TOP)
-        Labels = Label(topFrame, text = "SCRAPING")
-        Labels.pack(side = TOP)
-        texts = "change"
-        main_url = 'https://www.theverge.com/tech'
-        uClient = uReq(main_url)
-        page_html = uClient.read()
-        uClient.close()
-        page_soup = soup(page_html, "html.parser")
-        containers = page_soup.findAll("div",{"class":"c-compact-river__entry"})
-        Articles = len(containers)
-        filename = self.directory + "/News.txt"
-        trans_filename = self.directory + "/TranslatedNews.txt"
-        f = io.open(filename, "w", encoding="utf-8")
-        f.write("ACTIVE")
-        t = io.open(trans_filename, "w", encoding ="utf-8")
-        t.write("ACTIVE")
-        Labels.config(text = "setting file!")
-        i = 0
-        CurrentTitle = Label(topFrame, text = "Preparing...")
-        CurrentTitle.pack(side = TOP)
-        for container in containers:
-            i = i + 1
-            Labels.config(text = "jumping to URL!")
-            print(container["class"])
-            if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']:
-                print("\n WE'VE CATCHED A BUG!")
-                continue
-            if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]:
-                print("\n WARNING! THIS IS NOT AN ARTICLE! ")
-                print(container.div["class"])
-                continue
-            progress['value'] = i * 100 / Articles
-            local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate')
-            local_progress['value'] = 0
-            local_progress.pack(side = BOTTOM)
-            requiredURL = container.div.a["href"]
-            secondary_URL = requiredURL
-            print("Set target URL!")
-            secClient = uReq(secondary_URL)
-            news_html = secClient.read()
-            secClient.close()
-            news_soup = soup(news_html, "html.parser")
-            news_soup.decode('utf-8', 'ignore')
-            news_containers = news_soup.findAll("div", {"class":"c-entry-hero__header-wrap"})
-            if len(news_containers)>0:
-                news_title = news_containers[0].h1.text
-                CurrentTitle.config(text = news_title)
-                Labels.config(text = "Extracted Title!")
-            else:
-                print("ERROR! NO TITLE AT "+secondary_URL)
-                Labels.config(text = "Failed to extract title")
-            news_body = news_soup.findAll("div", {"class":"c-entry-content"})
-            print("\n TITLE: " + news_title)
-            f.write("\n \n" + news_title + "\n")
-            print("Now translating...")
-            translatedQuery = translate(news_title, "ru", "en")
-            t.write("\n \n" + translatedQuery + "\n")
-            paragraphs = news_body[0].findAll("p")
-            print("Title Recorded!")
-            local_progress['value'] = 10
-            y = len(paragraphs)
-            x = 0
-            fullText = ""
-            fullText2 = ""
-            for paragraph in paragraphs:
-                x = x + 1
-                local_progress['value'] = x * 100 / y + 10
-                stringx = str(x)
-                Labels.config(text = "Getting paragraph " + stringx + "...")
-                print(paragraph.text + "\n \n \n")
-                if x >= y/2:
-                    fullText2 = fullText2 + paragraph.text.strip()
-                else:
-                    fullText = fullText + paragraph.text.strip()
-                Labels.config(text = "Written and Translated Paragraph" + stringx + "!")
-                print("Writing Paragraph " + stringx + "...")
-                if self.needToSkip:
-                    break
-            if self.needToSkip:
-                self.needToSkip = False
-                continue
-            translatedQuery = translate(fullText, "ru", "en")
-            completeText = translatedQuery
-            translatedQuery = translate(fullText2, "ru", "en")
-            completeText = completeText + translatedQuery
-            f.write("\n" + fullText + fullText2)
-            t.write("\n" + completeText)
-            news_picture = news_soup.findAll("picture", {"class":"c-picture"})
-            Labels.config(text = "Getting image...")
-            if news_picture[0].img != None:
-                article_pic = news_picture[0].img.get("src")
-                Labels.config(text = "Picture recieved!")
-            else:
-                print("\n THIS ARTICLE HAS NO PICTURE! ")
-                Labels.config(text = "Failed to locate picture :(")
-            local_progress['value'] = 120
-            f.write("\n PICTURE URL: " + article_pic)
-            t.write("\n PICTURE URL: " + article_pic)
-            if self.stop_threads.is_set():
-                print("I SURRENDER!")
-                self.stopped = True
-                f.close()
-                t.close()
-                self.CloseLabel.config(text = "you may close now")
-                sys.exit()
-                self.CloseLabel.config(text = "I tried, I failed")
-                break
-            else:
-                print("NOTHING IS STOPPING ME!")
-            Labels.config(text = "Finished the article!")
-            #brand = divWithInfo.div.a.img["title"]
-            #title_container = divWithInfo.find("a", "item-title")
-            #product_name = title_container.text
-            #shipping_container = divWithInfo.find("li", "price-ship")
-            #shipping_cost = shipping_container.text.strip()
-            #print("brand:"+brand)
-            #print("name:"+product_name)
-            #print("shipping:"+shipping_cost)
-            #print("\n")
-            #f.write(brand + "," + product_name.replace(",", "|") + "," + shipping_cost + "\n")
-        Labels.config(text = "All Done!")
-        f.close()
-        t.close()
+        print("Getting" + button4['text'])
+        if button4['text'] == "techradar":
+            progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate')
+            progress['value'] = 0
+            progress.pack(side = TOP)
+            Labels = Label(topFrame, text = "SCRAPING")
+            Labels.pack(side = TOP)
+            texts = "change"
+            main_url = 'https://www.techradar.com/news'
+            uClient = uReq(main_url)
+            page_html = uClient.read()
+            uClient.close()
+            page_soup = soup(page_html, "html.parser")
+            containers = page_soup.findAll("div",{"class":"listingResult"})
+            Articles = len(containers)
+            print(Articles)
+            filename = self.directory + "/News.txt"
+            trans_filename = self.directory + "/TranslatedNews.txt"
+            f = io.open(filename, "w", encoding="utf-8")
+            f.write("ACTIVE")
+            t = io.open(trans_filename, "w", encoding ="utf-8")
+            t.write("ACTIVE")
+            Labels.config(text = "setting file!")
+            i = 0
+            CurrentTitle = Label(topFrame, text = "Preparing...")
+            CurrentTitle.pack(side = TOP)
+            for container in containers:
+                i = i + 1
+                Labels.config(text = "jumping to URL!")
+                print(container["class"])
+                if 'sponsored-post' in container["class"]:
+                    print("\n WE'VE CATCHED AN AD!")
+                    continue
+                progress['value'] = i * 100 / Articles
+                local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate')
+                local_progress['value'] = 0
+                local_progress.pack(side = BOTTOM)
+                requiredURL = container.a["href"]
+                secondary_URL = requiredURL
+                print("Set target URL!" + requiredURL)
+                secClient = uReq(secondary_URL)
+                news_html = secClient.read()
+                secClient.close()
+                news_soup = soup(news_html, "html.parser")
+                news_soup.decode('utf-8', 'ignore')
+                squash = news_soup.findAll("div",{"class":"icon-plus_circle"})
+                print(len(squash))
+                if len(squash)>0:
+                    print("\n WARNING! THIS IS NOT AN ARTICLE! ")
+                    print(container.div["class"])
+                    continue
+                news_containers = news_soup.findAll("header")
+                if len(news_containers)>0:
+                    news_title = news_containers[0].h1.text
+                    CurrentTitle.config(text = news_title)
+                    Labels.config(text = "Extracted Title!")
+                else:
+                    print("ERROR! NO TITLE AT "+secondary_URL)
+                    Labels.config(text = "Failed to extract title")
+                news_body = news_soup.findAll("div", {"id":"article-body"})
+                print("\n TITLE: " + news_title)
+                f.write("\n \n" + news_title + "\n")
+                print("Now translating...")
+                translatedQuery = translate(news_title, "ru", "en")
+                t.write("\n \n" + translatedQuery + "\n")
+                paragraphs = news_body[0].findAll("p")
+                print("Title Recorded!")
+                local_progress['value'] = 10
+                y = len(paragraphs)
+                x = 0
+                fullText = ""
+                fullText2 = ""
+                for paragraph in paragraphs:
+                    x = x + 1
+                    local_progress['value'] = x * 100 / y + 10
+                    stringx = str(x)
+                    Labels.config(text = "Getting paragraph " + stringx + "...")
+                    print(paragraph.text + "\n \n \n")
+                    if x >= y/2:
+                        fullText2 = fullText2 + paragraph.text.strip()
+                    else:
+                        fullText = fullText + paragraph.text.strip()
+                    Labels.config(text = "Written and Translated Paragraph" + stringx + "!")
+                    print("Writing Paragraph " + stringx + "...")
+                    if self.needToSkip:
+                        break
+                if self.needToSkip:
+                    self.needToSkip = False
+                    continue
+                translatedQuery = translate(fullText, "ru", "en")
+                completeText = translatedQuery
+                translatedQuery = translate(fullText2, "ru", "en")
+                completeText = completeText + translatedQuery
+                f.write("\n" + fullText + fullText2)
+                t.write("\n" + completeText)
+                news_picture = news_soup.findAll("source", {"class":"hero-image"})
+                Labels.config(text = "Getting image...")
+                if len(news_picture) > 0:
+                    article_pic = news_picture[0].get("data-original-mos")
+                    Labels.config(text = "Picture recieved!")
+                else:
+                    print("\n THIS ARTICLE HAS NO PICTURE! ")
+                    Labels.config(text = "Failed to locate picture :(")
+                local_progress['value'] = 120
+                f.write("\n PICTURE URL: " + article_pic)
+                t.write("\n PICTURE URL: " + article_pic)
+                if self.stop_threads.is_set():
+                    print("I SURRENDER!")
+                    self.stopped = True
+                    f.close()
+                    t.close()
+                    self.CloseLabel.config(text = "you may close now")
+                    sys.exit()
+                    self.CloseLabel.config(text = "I tried, I failed")
+                    break
+                else:
+                    print("NOTHING IS STOPPING ME!")
+                Labels.config(text = "Finished the article!")
+                #brand = divWithInfo.div.a.img["title"]
+                #title_container = divWithInfo.find("a", "item-title")
+                #product_name = title_container.text
+                #shipping_container = divWithInfo.find("li", "price-ship")
+                #shipping_cost = shipping_container.text.strip()
+                #print("brand:"+brand)
+                #print("name:"+product_name)
+                #print("shipping:"+shipping_cost)
+                #print("\n")
+                #f.write(brand + "," + product_name.replace(",", "|") + "," + shipping_cost + "\n")
+            Labels.config(text = "All Done!")
+            f.close()
+            t.close()
+        else:
+            progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate')
+            progress['value'] = 0
+            progress.pack(side = TOP)
+            Labels = Label(topFrame, text = "SCRAPING")
+            Labels.pack(side = TOP)
+            texts = "change"
+            main_url = 'https://www.theverge.com/tech'
+            uClient = uReq(main_url)
+            page_html = uClient.read()
+            uClient.close()
+            page_soup = soup(page_html, "html.parser")
+            containers = page_soup.findAll("div",{"class":"c-compact-river__entry"})
+            Articles = len(containers)
+            filename = self.directory + "/News.txt"
+            trans_filename = self.directory + "/TranslatedNews.txt"
+            f = io.open(filename, "w", encoding="utf-8")
+            f.write("ACTIVE")
+            t = io.open(trans_filename, "w", encoding ="utf-8")
+            t.write("ACTIVE")
+            Labels.config(text = "setting file!")
+            i = 0
+            CurrentTitle = Label(topFrame, text = "Preparing...")
+            CurrentTitle.pack(side = TOP)
+            for container in containers:
+                i = i + 1
+                Labels.config(text = "jumping to URL!")
+                print(container["class"])
+                if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']:
+                    print("\n WE'VE CATCHED A BUG!")
+                    continue
+                if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]:
+                    print("\n WARNING! THIS IS NOT AN ARTICLE! ")
+                    print(container.div["class"])
+                    continue
+                progress['value'] = i * 100 / Articles
+                local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate')
+                local_progress['value'] = 0
+                local_progress.pack(side = BOTTOM)
+                requiredURL = container.div.a["href"]
+                secondary_URL = requiredURL
+                print("Set target URL!")
+                secClient = uReq(secondary_URL)
+                news_html = secClient.read()
+                secClient.close()
+                news_soup = soup(news_html, "html.parser")
+                news_soup.decode('utf-8', 'ignore')
+                news_containers = news_soup.findAll("div", {"class":"c-entry-hero__header-wrap"})
+                if len(news_containers)>0:
+                    news_title = news_containers[0].h1.text
+                    CurrentTitle.config(text = news_title)
+                    Labels.config(text = "Extracted Title!")
+                else:
+                    print("ERROR! NO TITLE AT "+secondary_URL)
+                    Labels.config(text = "Failed to extract title")
+                news_body = news_soup.findAll("div", {"class":"c-entry-content"})
+                print("\n TITLE: " + news_title)
+                f.write("\n \n" + news_title + "\n")
+                print("Now translating...")
+                translatedQuery = translate(news_title, "ru", "en")
+                t.write("\n \n" + translatedQuery + "\n")
+                paragraphs = news_body[0].findAll("p")
+                print("Title Recorded!")
+                local_progress['value'] = 10
+                y = len(paragraphs)
+                x = 0
+                fullText = ""
+                fullText2 = ""
+                for paragraph in paragraphs:
+                    x = x + 1
+                    local_progress['value'] = x * 100 / y + 10
+                    stringx = str(x)
+                    Labels.config(text = "Getting paragraph " + stringx + "...")
+                    print(paragraph.text + "\n \n \n")
+                    if x >= y/2:
+                        fullText2 = fullText2 + paragraph.text.strip()
+                    else:
+                        fullText = fullText + paragraph.text.strip()
+                    Labels.config(text = "Written and Translated Paragraph" + stringx + "!")
+                    print("Writing Paragraph " + stringx + "...")
+                    if self.needToSkip:
+                        break
+                if self.needToSkip:
+                    self.needToSkip = False
+                    continue
+                translatedQuery = translate(fullText, "ru", "en")
+                completeText = translatedQuery
+                translatedQuery = translate(fullText2, "ru", "en")
+                completeText = completeText + translatedQuery
+                f.write("\n" + fullText + fullText2)
+                t.write("\n" + completeText)
+                news_picture = news_soup.findAll("picture", {"class":"c-picture"})
+                Labels.config(text = "Getting image...")
+                if news_picture[0].img != None:
+                    article_pic = news_picture[0].img.get("src")
+                    Labels.config(text = "Picture recieved!")
+                else:
+                    print("\n THIS ARTICLE HAS NO PICTURE! ")
+                    Labels.config(text = "Failed to locate picture :(")
+                local_progress['value'] = 120
+                f.write("\n PICTURE URL: " + article_pic)
+                t.write("\n PICTURE URL: " + article_pic)
+                if self.stop_threads.is_set():
+                    print("I SURRENDER!")
+                    self.stopped = True
+                    f.close()
+                    t.close()
+                    self.CloseLabel.config(text = "you may close now")
+                    sys.exit()
+                    self.CloseLabel.config(text = "I tried, I failed")
+                    break
+                else:
+                    print("NOTHING IS STOPPING ME!")
+                Labels.config(text = "Finished the article!")
+                #brand = divWithInfo.div.a.img["title"]
+                #title_container = divWithInfo.find("a", "item-title")
+                #product_name = title_container.text
+                #shipping_container = divWithInfo.find("li", "price-ship")
+                #shipping_cost = shipping_container.text.strip()
+                #print("brand:"+brand)
+                #print("name:"+product_name)
+                #print("shipping:"+shipping_cost)
+                #print("\n")
+                #f.write(brand + "," + product_name.replace(",", "|") + "," + shipping_cost + "\n")
+            Labels.config(text = "All Done!")
+            f.close()
+            t.close()
 texts = "VERGE SCRAPPER"
 root = Tk()
 program = Scrapers()
@@ -214,9 +365,11 @@ topFrame.pack()
 bottomFrame = Frame(root)
 bottomFrame.pack(side=BOTTOM)
 button1 = Button(topFrame, text = "Start Scrapping!", command = program.start_thread)
+button4 = Button(topFrame, text = agency, command = lambda: switchAgencies(button4['text']))
 button2 = Button(topFrame, text = "Choose Text Location", fg = "black", command = chooseDirectory)
 button3 = Button(topFrame, text = "STOP!", fg = "red", command = program.stopTheThread)
 button3.pack(side = TOP)
 button1.pack(side= TOP)
+button4.pack(side= TOP)
 button2.pack(side = TOP)
 root.mainloop()
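
The new TechRadar path in start_now boils down to: fetch https://www.techradar.com/news, skip listings carrying the sponsored-post class, and follow each remaining entry's link. A trimmed, GUI-free sketch of just that listing step, assuming the same libraries the script appears to rely on (urllib and BeautifulSoup, with the uReq/soup aliases written out as plain imports); the CSS class names are taken from the diff and may no longer match the live site:

# GUI-free sketch of the TechRadar listing scrape added in this commit.
# Class names ("listingResult", "sponsored-post") are copied from the diff
# and are assumptions about the page markup at the time of the commit.
from urllib.request import urlopen
from bs4 import BeautifulSoup

main_url = "https://www.techradar.com/news"
with urlopen(main_url) as client:
    page_html = client.read()

page_soup = BeautifulSoup(page_html, "html.parser")
containers = page_soup.find_all("div", {"class": "listingResult"})

for container in containers:
    # Sponsored listings are skipped, mirroring the check in start_now.
    if "sponsored-post" in container.get("class", []):
        continue
    link = container.a  # first anchor inside the listing block holds the article URL
    if link is not None:
        print(link["href"])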
