| @@ -15,10 +15,20 @@ from threading import * | |||
| import os | |||
| from tkinter import messagebox | |||
# Expanded path of the current user's ~/Documents folder (despite the name, this is not the Desktop).
| desktop = expanduser("~/Documents") | |||
# Initial agency name; it is used as the starting label of the agency toggle button (button4).
| agency = "verge" | |||
def chooseDirectory():
    """Ask the user for an output directory and store it on ``program``.

    Opens a directory-selection dialog (parented to ``root``) that starts in
    the current working directory.  The chosen path is saved as
    ``program.directory``.  If the user cancels the dialog, the previously
    stored directory is left untouched.
    """
    currdir = os.getcwd()
    tempdir = filedialog.askdirectory(
        parent=root,
        initialdir=currdir,
        title='Please select a directory',
    )
    # askdirectory returns an empty value when the dialog is cancelled.
    # Storing it would make later path building (directory + "/News.txt")
    # point at the filesystem root.
    if tempdir:
        program.directory = tempdir
def switchAgencies(agencies):
    """Flip the agency toggle button between "verge" and "techradar".

    ``agencies`` is the label currently shown on ``button4``.  When it is
    "verge" the button is relabelled "techradar"; for any other value it is
    relabelled "verge".  Each call logs what it is doing to stdout.
    """
    print("called Agencies")
    if agencies != "verge":
        message, target = "switching to verge", "verge"
    else:
        message, target = "switching to techradar", "techradar"
    print(message)
    button4['text'] = target
| class Scrapers(object): | |||
| def __init__(self): | |||
| self.thread1 = None | |||
| @@ -63,136 +73,277 @@ class Scrapers(object): | |||
| self.thread1.start() | |||
| threadActive = 1 | |||
def start_now(self):
    """Scrape the selected news site into ``News.txt`` and ``TranslatedNews.txt``.

    The site is taken from the label of ``button4``: "techradar" scrapes
    TechRadar, any other label scrapes The Verge.  Both output files are
    created inside ``self.directory`` and start with the marker "ACTIVE".
    For each listing entry that is accepted, the article title and paragraph
    text are written to ``News.txt`` and their ``translate(text, "ru", "en")``
    results to ``TranslatedNews.txt``, followed by the picture URL when one
    is found.  Progress is shown through widgets created in ``topFrame``.

    Control flags read from ``self``:
      * ``needToSkip`` - abandons the article in progress (its title has
        already been written) and is reset to ``False``.
      * ``stop_threads`` - when set, ``self.stopped`` becomes ``True``, the
        files are closed and the run ends through ``sys.exit()``.  The flag
        is checked after every listing entry, including skipped ones.

    Entries without a link, title or body are skipped instead of raising or
    re-using data from the previous article.  The output files are closed
    even when an exception escapes.

    NOTE(review): this presumably runs on the worker thread started by
    ``start_thread`` while updating Tk widgets directly - confirm that this
    is safe for the Tk build in use.
    """
    site_name = button4['text']
    print("Getting" + site_name)
    profile = self._site_profile(site_name)

    progress = Progressbar(topFrame, orient=HORIZONTAL, length=100, mode='determinate')
    progress['value'] = 0
    progress.pack(side=TOP)
    status_label = Label(topFrame, text="SCRAPING")
    status_label.pack(side=TOP)

    listing_soup = self._fetch_soup(profile["main_url"])
    containers = listing_soup.findAll("div", profile["listing_attrs"])
    article_count = len(containers)
    print(article_count)

    filename = self.directory + "/News.txt"
    trans_filename = self.directory + "/TranslatedNews.txt"
    with io.open(filename, "w", encoding="utf-8") as f, \
            io.open(trans_filename, "w", encoding="utf-8") as t:
        f.write("ACTIVE")
        t.write("ACTIVE")
        status_label.config(text="setting file!")
        title_label = Label(topFrame, text="Preparing...")
        title_label.pack(side=TOP)
        # One per-article bar reused for the whole run; creating a new widget
        # for every article would stack bars at the bottom of the frame.
        local_progress = Progressbar(topFrame, orient=HORIZONTAL, length=120, mode='determinate')
        local_progress['value'] = 0
        local_progress.pack(side=BOTTOM)

        for i, container in enumerate(containers, start=1):
            status_label.config(text="jumping to URL!")
            print(container["class"])
            finished = False
            if not profile["skip_listing"](container):
                progress['value'] = i * 100 / article_count
                finished = self._process_article(
                    container, profile, f, t,
                    status_label, title_label, local_progress,
                )
            if self.stop_threads.is_set():
                print("I SURRENDER!")
                self.stopped = True
                # Close before telling the user it is safe to close the app;
                # the enclosing ``with`` tolerates already-closed files.
                f.close()
                t.close()
                self.CloseLabel.config(text="you may close now")
                sys.exit()
            if finished:
                print("NOTHING IS STOPPING ME!")
                status_label.config(text="Finished the article!")

        status_label.config(text="All Done!")


def _fetch_soup(self, url):
    """Download *url* and return it parsed with the "html.parser" backend."""
    client = uReq(url)
    try:
        page_html = client.read()
    finally:
        client.close()
    return soup(page_html, "html.parser")


def _site_profile(self, site_name):
    """Return the selectors and site-specific callbacks for *site_name*.

    "techradar" selects the TechRadar profile; every other value selects
    The Verge.  The returned dict has the keys ``main_url``,
    ``listing_attrs``, ``title_query``, ``body_attrs``, ``skip_listing``,
    ``article_url``, ``skip_article`` and ``picture_url``.
    """
    if site_name == "techradar":
        def skip_listing(container):
            # Sponsored entries are ads, not news articles.
            if 'sponsored-post' in container["class"]:
                print("\n WE'VE CATCHED AN AD!")
                return True
            return False

        def article_url(container):
            link = container.a
            return link.get("href") if link is not None else None

        def skip_article(news_soup):
            # Pages containing this element are treated as non-articles.
            squash = news_soup.findAll("div", {"class": "icon-plus_circle"})
            print(len(squash))
            if squash:
                print("\n WARNING! THIS IS NOT AN ARTICLE! ")
                return True
            return False

        def picture_url(news_soup):
            sources = news_soup.findAll("source", {"class": "hero-image"})
            return sources[0].get("data-original-mos") if sources else None

        return {
            "main_url": 'https://www.techradar.com/news',
            "listing_attrs": {"class": "listingResult"},
            "title_query": ("header",),
            "body_attrs": {"id": "article-body"},
            "skip_listing": skip_listing,
            "article_url": article_url,
            "skip_article": skip_article,
            "picture_url": picture_url,
        }

    def skip_listing(container):
        if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']:
            print("\n WE'VE CATCHED A BUG!")
            return True
        box = container.div
        box_class = box.get("class") if box is not None else None
        if box_class != ["c-entry-box--compact", "c-entry-box--compact--article"]:
            print("\n WARNING! THIS IS NOT AN ARTICLE! ")
            print(box_class)
            return True
        return False

    def article_url(container):
        box = container.div
        link = box.a if box is not None else None
        return link.get("href") if link is not None else None

    def skip_article(news_soup):
        return False

    def picture_url(news_soup):
        pictures = news_soup.findAll("picture", {"class": "c-picture"})
        if pictures and pictures[0].img is not None:
            return pictures[0].img.get("src")
        return None

    return {
        "main_url": 'https://www.theverge.com/tech',
        "listing_attrs": {"class": "c-compact-river__entry"},
        "title_query": ("div", {"class": "c-entry-hero__header-wrap"}),
        "body_attrs": {"class": "c-entry-content"},
        "skip_listing": skip_listing,
        "article_url": article_url,
        "skip_article": skip_article,
        "picture_url": picture_url,
    }


def _process_article(self, container, profile, f, t, status_label, title_label, local_progress):
    """Fetch one article and append it to both output files.

    Returns ``True`` when the article was written completely.  Returns
    ``False`` when the entry has no link, the profile rejects the page, the
    title or body cannot be found, or ``self.needToSkip`` was raised while
    reading paragraphs (in which case only the title has been written).
    """
    local_progress['value'] = 0
    secondary_URL = profile["article_url"](container)
    if not secondary_URL:
        print("ERROR! NO ARTICLE LINK IN LISTING ENTRY")
        return False
    print("Set target URL!" + secondary_URL)
    news_soup = self._fetch_soup(secondary_URL)
    if profile["skip_article"](news_soup):
        return False

    news_containers = news_soup.findAll(*profile["title_query"])
    title_tag = news_containers[0].h1 if news_containers else None
    if title_tag is None:
        # Without a title there is nothing sensible to record; do not fall
        # through with an unbound or stale title.
        print("ERROR! NO TITLE AT " + secondary_URL)
        status_label.config(text="Failed to extract title")
        return False
    news_title = title_tag.text
    title_label.config(text=news_title)
    status_label.config(text="Extracted Title!")

    news_body = news_soup.findAll("div", profile["body_attrs"])
    if not news_body:
        print("ERROR! NO BODY AT " + secondary_URL)
        status_label.config(text="Failed to extract article body")
        return False

    print("\n TITLE: " + news_title)
    f.write("\n \n" + news_title + "\n")
    print("Now translating...")
    t.write("\n \n" + translate(news_title, "ru", "en") + "\n")
    paragraphs = news_body[0].findAll("p")
    print("Title Recorded!")
    local_progress['value'] = 10

    # The text is split into two halves that are translated separately.
    y = len(paragraphs)
    first_half = []
    second_half = []
    for x, paragraph in enumerate(paragraphs, start=1):
        local_progress['value'] = x * 100 / y + 10
        stringx = str(x)
        status_label.config(text="Getting paragraph " + stringx + "...")
        print(paragraph.text + "\n \n \n")
        (second_half if x >= y / 2 else first_half).append(paragraph.text.strip())
        status_label.config(text="Written and Translated Paragraph" + stringx + "!")
        print("Writing Paragraph " + stringx + "...")
        if self.needToSkip:
            break
    if self.needToSkip:
        self.needToSkip = False
        return False

    fullText = "".join(first_half)
    fullText2 = "".join(second_half)
    # Empty halves are not sent to translate(); NOTE(review): assumes
    # translate() has nothing useful to do with an empty string - confirm.
    completeText = "".join(
        translate(part, "ru", "en") if part else ""
        for part in (fullText, fullText2)
    )
    f.write("\n" + fullText + fullText2)
    t.write("\n" + completeText)

    status_label.config(text="Getting image...")
    article_pic = profile["picture_url"](news_soup)
    if article_pic:
        status_label.config(text="Picture recieved!")
        f.write("\n PICTURE URL: " + article_pic)
        t.write("\n PICTURE URL: " + article_pic)
    else:
        # No picture line is written rather than repeating a previous URL.
        print("\n THIS ARTICLE HAS NO PICTURE! ")
        status_label.config(text="Failed to locate picture :(")
    local_progress['value'] = 120
    return True
# NOTE(review): not referenced in the visible part of the file - presumably a title string; confirm.
| texts = "VERGE SCRAPPER" | |||
# Main Tk window; the frames and the directory dialog use it as their parent.
| root = Tk() | |||
# Scraper instance: its methods are bound to the buttons and chooseDirectory stores the output directory on it.
| program = Scrapers() | |||
| @@ -214,9 +365,11 @@ topFrame.pack() | |||
# Frame anchored to the bottom edge of the main window.
bottomFrame = Frame(root)
bottomFrame.pack(side=BOTTOM)

# Control buttons; all of them live in topFrame.
button1 = Button(topFrame, command=program.start_thread, text="Start Scrapping!")
# The agency toggle passes its own current label to switchAgencies.
button4 = Button(topFrame, command=lambda: switchAgencies(button4['text']), text=agency)
button2 = Button(topFrame, command=chooseDirectory, fg="black", text="Choose Text Location")
button3 = Button(topFrame, command=program.stopTheThread, fg="red", text="STOP!")

# Pack order defines the top-to-bottom layout: STOP, start, agency, directory.
for widget in (button3, button1, button4, button2):
    widget.pack(side=TOP)

# Hand control to the Tk event loop.
root.mainloop()