From ccc905e35191162f6396431b3c55c564cf81b7fe Mon Sep 17 00:00:00 2001 From: madiwka3 Date: Tue, 7 Jul 2020 12:11:45 +0600 Subject: [PATCH] added techradar --- GOOGLE.pyw | 403 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 278 insertions(+), 125 deletions(-) diff --git a/GOOGLE.pyw b/GOOGLE.pyw index 99eef0d..c953fb0 100755 --- a/GOOGLE.pyw +++ b/GOOGLE.pyw @@ -15,10 +15,20 @@ from threading import * import os from tkinter import messagebox desktop = expanduser("~/Documents") +agency = "verge" def chooseDirectory(): currdir = os.getcwd() tempdir = filedialog.askdirectory(parent=root, initialdir=currdir, title='Please select a directory') program.directory = tempdir +def switchAgencies(agencies): + print("called Agencies") + if agencies == "verge": + print("switching to techradar") + agencies = "techradar" + else: + print("switching to verge") + agencies = "verge" + button4['text'] = agencies class Scrapers(object): def __init__(self): self.thread1 = None @@ -63,136 +73,277 @@ class Scrapers(object): self.thread1.start() threadActive = 1 def start_now(self): - progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate') - progress['value'] = 0 - progress.pack(side = TOP) - Labels = Label(topFrame, text = "SCRAPING") - Labels.pack(side = TOP) - texts = "change" - main_url = 'https://www.theverge.com/tech' - uClient = uReq(main_url) - page_html = uClient.read() - uClient.close() - page_soup = soup(page_html, "html.parser") - containers = page_soup.findAll("div",{"class":"c-compact-river__entry"}) - Articles = len(containers) - filename = self.directory + "/News.txt" - trans_filename = self.directory + "/TranslatedNews.txt" - f = io.open(filename, "w", encoding="utf-8") - f.write("ACTIVE") - t = io.open(trans_filename, "w", encoding ="utf-8") - t.write("ACTIVE") - Labels.config(text = "setting file!") - i = 0 - CurrentTitle = Label(topFrame, text = "Preparing...") - CurrentTitle.pack(side = TOP) - for container in containers: - i = i + 1 - Labels.config(text = "jumping to URL!") - print(container["class"]) - if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']: - print("\n WE'VE CATCHED A BUG!") - continue - if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]: - print("\n WARNING! THIS IS NOT AN ARTICLE! ") - print(container.div["class"]) - continue - progress['value'] = i * 100 / Articles - local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate') - local_progress['value'] = 0 - local_progress.pack(side = BOTTOM) - requiredURL = container.div.a["href"] - secondary_URL = requiredURL - print("Set target URL!") - secClient = uReq(secondary_URL) - news_html = secClient.read() - secClient.close() - news_soup = soup(news_html, "html.parser") - news_soup.decode('utf-8', 'ignore') - news_containers = news_soup.findAll("div", {"class":"c-entry-hero__header-wrap"}) - if len(news_containers)>0: - news_title = news_containers[0].h1.text - CurrentTitle.config(text = news_title) - Labels.config(text = "Extracted Title!") - else: - print("ERROR! NO TITLE AT "+secondary_URL) - Labels.config(text = "Failed to extract title") - news_body = news_soup.findAll("div", {"class":"c-entry-content"}) - print("\n TITLE: " + news_title) - f.write("\n \n" + news_title + "\n") - print("Now translating...") - translatedQuery = translate(news_title, "ru", "en") - t.write("\n \n" + translatedQuery + "\n") - paragraphs = news_body[0].findAll("p") - print("Title Recorded!") - local_progress['value'] = 10 - y = len(paragraphs) - x = 0 - fullText = "" - fullText2 = "" - for paragraph in paragraphs: + print("Getting" + button4['text']) + if button4['text'] == "techradar": + progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate') + progress['value'] = 0 + progress.pack(side = TOP) + Labels = Label(topFrame, text = "SCRAPING") + Labels.pack(side = TOP) + texts = "change" + main_url = 'https://www.techradar.com/news' + uClient = uReq(main_url) + page_html = uClient.read() + uClient.close() + page_soup = soup(page_html, "html.parser") + containers = page_soup.findAll("div",{"class":"listingResult"}) + + Articles = len(containers) + print(Articles) + filename = self.directory + "/News.txt" + trans_filename = self.directory + "/TranslatedNews.txt" + f = io.open(filename, "w", encoding="utf-8") + f.write("ACTIVE") + t = io.open(trans_filename, "w", encoding ="utf-8") + t.write("ACTIVE") + Labels.config(text = "setting file!") + i = 0 + CurrentTitle = Label(topFrame, text = "Preparing...") + CurrentTitle.pack(side = TOP) + for container in containers: + + + + i = i + 1 + Labels.config(text = "jumping to URL!") + print(container["class"]) + if 'sponsored-post' in container["class"]: + print("\n WE'VE CATCHED AN AD!") + continue + progress['value'] = i * 100 / Articles + local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate') + local_progress['value'] = 0 + local_progress.pack(side = BOTTOM) + requiredURL = container.a["href"] + secondary_URL = requiredURL + print("Set target URL!" + requiredURL) + secClient = uReq(secondary_URL) + news_html = secClient.read() + secClient.close() + news_soup = soup(news_html, "html.parser") + news_soup.decode('utf-8', 'ignore') + squash = news_soup.findAll("div",{"class":"icon-plus_circle"}) + print(len(squash)) + if len(squash)>0: + print("\n WARNING! THIS IS NOT AN ARTICLE! ") + print(container.div["class"]) + continue + news_containers = news_soup.findAll("header") + if len(news_containers)>0: + news_title = news_containers[0].h1.text + CurrentTitle.config(text = news_title) + Labels.config(text = "Extracted Title!") + else: + print("ERROR! NO TITLE AT "+secondary_URL) + Labels.config(text = "Failed to extract title") + news_body = news_soup.findAll("div", {"id":"article-body"}) - x = x + 1 - local_progress['value'] = x * 100 / y + 10 - stringx = str(x) - Labels.config(text = "Getting paragraph " + stringx + "...") - print(paragraph.text + "\n \n \n") - if x >= y/2: - fullText2 = fullText2 + paragraph.text.strip() - else: - fullText = fullText + paragraph.text.strip() - Labels.config(text = "Written and Translated Paragraph" + stringx + "!") - print("Writing Paragraph " + stringx + "...") - if self.needToSkip: - break - - if self.needToSkip: - self.needToSkip = False - continue - translatedQuery = translate(fullText, "ru", "en") - completeText = translatedQuery - translatedQuery = translate(fullText2, "ru", "en") - completeText = completeText + translatedQuery - f.write("\n" + fullText + fullText2) - t.write("\n" + completeText) - news_picture = news_soup.findAll("picture", {"class":"c-picture"}) - Labels.config(text = "Getting image...") - if news_picture[0].img != None: - article_pic = news_picture[0].img.get("src") - Labels.config(text = "Picture recieved!") - else: - print("\n THIS ARTICLE HAS NO PICTURE! ") - Labels.config(text = "Failed to locate picture :(") - local_progress['value'] = 120 - f.write("\n PICTURE URL: " + article_pic) - t.write("\n PICTURE URL: " + article_pic) - if self.stop_threads.is_set(): - print("I SURRENDER!") - self.stopped = True + print("\n TITLE: " + news_title) + f.write("\n \n" + news_title + "\n") + print("Now translating...") + translatedQuery = translate(news_title, "ru", "en") + t.write("\n \n" + translatedQuery + "\n") + paragraphs = news_body[0].findAll("p") + print("Title Recorded!") + local_progress['value'] = 10 + y = len(paragraphs) + x = 0 + fullText = "" + fullText2 = "" + for paragraph in paragraphs: + + x = x + 1 + local_progress['value'] = x * 100 / y + 10 + stringx = str(x) + Labels.config(text = "Getting paragraph " + stringx + "...") + print(paragraph.text + "\n \n \n") + if x >= y/2: + fullText2 = fullText2 + paragraph.text.strip() + else: + fullText = fullText + paragraph.text.strip() + Labels.config(text = "Written and Translated Paragraph" + stringx + "!") + print("Writing Paragraph " + stringx + "...") + if self.needToSkip: + break + + if self.needToSkip: + self.needToSkip = False + continue + translatedQuery = translate(fullText, "ru", "en") + completeText = translatedQuery + translatedQuery = translate(fullText2, "ru", "en") + completeText = completeText + translatedQuery + f.write("\n" + fullText + fullText2) + t.write("\n" + completeText) + news_picture = news_soup.findAll("source", {"class":"hero-image"}) + Labels.config(text = "Getting image...") + if len(news_picture) > 0: + article_pic = news_picture[0].get("data-original-mos") + Labels.config(text = "Picture recieved!") + else: + print("\n THIS ARTICLE HAS NO PICTURE! ") + Labels.config(text = "Failed to locate picture :(") + local_progress['value'] = 120 + f.write("\n PICTURE URL: " + article_pic) + t.write("\n PICTURE URL: " + article_pic) + if self.stop_threads.is_set(): + print("I SURRENDER!") + self.stopped = True + f.close() + t.close() + self.CloseLabel.config(text = "you may close now") + sys.exit() + self.CloseLabel.config(text = "I tried, I failed") + break + else: + print("NOTHING IS STOPPING ME!") + Labels.config(text = "Finished the article!") + #brand = divWithInfo.div.a.img["title"] + #title_container = divWithInfo.find("a", "item-title") + #product_name = title_container.text + #shipping_container = divWithInfo.find("li", "price-ship") + #shipping_cost = shipping_container.text.strip() + + #print("brand:"+brand) + #print("name:"+product_name) + #print("shipping:"+shipping_cost) + #print("\n") + + #f.write(brand + "," + product_name.replace(",", "|") + "," + shipping_cost + "\n") + Labels.config(text = "All Done!") f.close() t.close() - self.CloseLabel.config(text = "you may close now") - sys.exit() - self.CloseLabel.config(text = "I tried, I failed") - break - else: - print("NOTHING IS STOPPING ME!") - Labels.config(text = "Finished the article!") - #brand = divWithInfo.div.a.img["title"] - #title_container = divWithInfo.find("a", "item-title") - #product_name = title_container.text - #shipping_container = divWithInfo.find("li", "price-ship") - #shipping_cost = shipping_container.text.strip() + else: + progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate') + progress['value'] = 0 + progress.pack(side = TOP) + Labels = Label(topFrame, text = "SCRAPING") + Labels.pack(side = TOP) + texts = "change" + main_url = 'https://www.theverge.com/tech' + uClient = uReq(main_url) + page_html = uClient.read() + uClient.close() + page_soup = soup(page_html, "html.parser") + containers = page_soup.findAll("div",{"class":"c-compact-river__entry"}) + Articles = len(containers) + filename = self.directory + "/News.txt" + trans_filename = self.directory + "/TranslatedNews.txt" + f = io.open(filename, "w", encoding="utf-8") + f.write("ACTIVE") + t = io.open(trans_filename, "w", encoding ="utf-8") + t.write("ACTIVE") + Labels.config(text = "setting file!") + i = 0 + CurrentTitle = Label(topFrame, text = "Preparing...") + CurrentTitle.pack(side = TOP) + for container in containers: + i = i + 1 + Labels.config(text = "jumping to URL!") + print(container["class"]) + if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']: + print("\n WE'VE CATCHED A BUG!") + continue + if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]: + print("\n WARNING! THIS IS NOT AN ARTICLE! ") + print(container.div["class"]) + continue + progress['value'] = i * 100 / Articles + local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate') + local_progress['value'] = 0 + local_progress.pack(side = BOTTOM) + requiredURL = container.div.a["href"] + secondary_URL = requiredURL + print("Set target URL!") + secClient = uReq(secondary_URL) + news_html = secClient.read() + secClient.close() + news_soup = soup(news_html, "html.parser") + news_soup.decode('utf-8', 'ignore') + news_containers = news_soup.findAll("div", {"class":"c-entry-hero__header-wrap"}) + if len(news_containers)>0: + news_title = news_containers[0].h1.text + CurrentTitle.config(text = news_title) + Labels.config(text = "Extracted Title!") + else: + print("ERROR! NO TITLE AT "+secondary_URL) + Labels.config(text = "Failed to extract title") + news_body = news_soup.findAll("div", {"class":"c-entry-content"}) + print("\n TITLE: " + news_title) + f.write("\n \n" + news_title + "\n") + print("Now translating...") + translatedQuery = translate(news_title, "ru", "en") + t.write("\n \n" + translatedQuery + "\n") + paragraphs = news_body[0].findAll("p") + print("Title Recorded!") + local_progress['value'] = 10 + y = len(paragraphs) + x = 0 + fullText = "" + fullText2 = "" + for paragraph in paragraphs: - #print("brand:"+brand) - #print("name:"+product_name) - #print("shipping:"+shipping_cost) - #print("\n") + x = x + 1 + local_progress['value'] = x * 100 / y + 10 + stringx = str(x) + Labels.config(text = "Getting paragraph " + stringx + "...") + print(paragraph.text + "\n \n \n") + if x >= y/2: + fullText2 = fullText2 + paragraph.text.strip() + else: + fullText = fullText + paragraph.text.strip() + Labels.config(text = "Written and Translated Paragraph" + stringx + "!") + print("Writing Paragraph " + stringx + "...") + if self.needToSkip: + break + + if self.needToSkip: + self.needToSkip = False + continue + translatedQuery = translate(fullText, "ru", "en") + completeText = translatedQuery + translatedQuery = translate(fullText2, "ru", "en") + completeText = completeText + translatedQuery + f.write("\n" + fullText + fullText2) + t.write("\n" + completeText) + news_picture = news_soup.findAll("picture", {"class":"c-picture"}) + Labels.config(text = "Getting image...") + if news_picture[0].img != None: + article_pic = news_picture[0].img.get("src") + Labels.config(text = "Picture recieved!") + else: + print("\n THIS ARTICLE HAS NO PICTURE! ") + Labels.config(text = "Failed to locate picture :(") + local_progress['value'] = 120 + f.write("\n PICTURE URL: " + article_pic) + t.write("\n PICTURE URL: " + article_pic) + if self.stop_threads.is_set(): + print("I SURRENDER!") + self.stopped = True + f.close() + t.close() + self.CloseLabel.config(text = "you may close now") + sys.exit() + self.CloseLabel.config(text = "I tried, I failed") + break + else: + print("NOTHING IS STOPPING ME!") + Labels.config(text = "Finished the article!") + #brand = divWithInfo.div.a.img["title"] + #title_container = divWithInfo.find("a", "item-title") + #product_name = title_container.text + #shipping_container = divWithInfo.find("li", "price-ship") + #shipping_cost = shipping_container.text.strip() - #f.write(brand + "," + product_name.replace(",", "|") + "," + shipping_cost + "\n") - Labels.config(text = "All Done!") - f.close() - t.close() + #print("brand:"+brand) + #print("name:"+product_name) + #print("shipping:"+shipping_cost) + #print("\n") + + #f.write(brand + "," + product_name.replace(",", "|") + "," + shipping_cost + "\n") + Labels.config(text = "All Done!") + f.close() + t.close() texts = "VERGE SCRAPPER" root = Tk() program = Scrapers() @@ -214,9 +365,11 @@ topFrame.pack() bottomFrame = Frame(root) bottomFrame.pack(side=BOTTOM) button1 = Button(topFrame, text = "Start Scrapping!", command = program.start_thread) +button4 = Button(topFrame, text = agency, command = lambda: switchAgencies(button4['text'])) button2 = Button(topFrame, text = "Choose Text Location", fg = "black", command = chooseDirectory) button3 = Button(topFrame, text = "STOP!", fg = "red", command = program.stopTheThread) button3.pack(side = TOP) button1.pack(side= TOP) +button4.pack(side= TOP) button2.pack(side = TOP) root.mainloop() \ No newline at end of file