| @@ -0,0 +1,222 @@ | |||||
| import tkinter | |||||
| from tkinter import * | |||||
| from os.path import expanduser | |||||
| from tkinter import filedialog | |||||
| from tkinter.ttk import Progressbar | |||||
| from urllib.request import urlopen as uReq | |||||
| from bs4 import BeautifulSoup as soup | |||||
| import io | |||||
| from mtranslate import translate | |||||
| import playsound | |||||
| import sys | |||||
| import time | |||||
| from threading import * | |||||
| import os | |||||
| from tkinter import messagebox | |||||
# Default output folder: the current user's Documents directory.
desktop = expanduser("~/Documents")


def chooseDirectory():
    """Ask the user for an output folder and store it on the scraper.

    The dialog opens in the current working directory.  When the user
    dismisses the dialog the previously selected folder is kept, so the
    output files are never redirected to an empty path.
    """
    currdir = os.getcwd()
    tempdir = filedialog.askdirectory(parent=root, initialdir=currdir, title='Please select a directory')
    # askdirectory() hands back an empty value when the dialog is cancelled;
    # storing it would make the scraper write to "/News.txt".
    if tempdir:
        program.directory = tempdir
| class Scrapers(object): | |||||
| def __init__(self): | |||||
| self.thread1 = None | |||||
| self.stop_threads = Event() | |||||
| self.stopped = False | |||||
| self.CloseLabel = Label(root, text = "Finalizing before breaking!") | |||||
| self.directory = desktop | |||||
| self.needToSkip = False | |||||
| def waitandkill(self): | |||||
| time.sleep(1) | |||||
| if (self.stopped == True): | |||||
| print("DEAD") | |||||
| else: | |||||
| self.waitandkill | |||||
| def stopTheThread(self): | |||||
| print("CALLED ME TOO?") | |||||
| self.stop_threads.set() | |||||
| self.CloseLabel.pack() | |||||
| self.waitandkill | |||||
| print("calling wait") | |||||
| def skip(self): | |||||
| self.needToSkip = True | |||||
| def start_thread(self): | |||||
| Skip = Button(topFrame, text = "SKIP!", command = self.skip) | |||||
| Skip.pack(side = BOTTOM) | |||||
| try: | |||||
| f = io.open(self.directory + "/TranslatedNews.txt", "w", encoding="utf-8") | |||||
| except IOError: | |||||
| print("FILE ERROR!" + self.directory + "/TranslatedNews.txt") | |||||
| messagebox.showerror("ACCESS ERROR!", "We can't access "+ self.directory + "/TranslatedNews.txt") | |||||
| sys.exit() | |||||
| try: | |||||
| f = io.open(self.directory + "/News.txt", "w", encoding="utf-8") | |||||
| except IOError: | |||||
| print("FILE ERROR!" + self.directory + "/News.txt") | |||||
| messagebox.showerror("ACCESS ERROR!", "We can't access "+ self.directory + "/News.txt") | |||||
| sys.exit() | |||||
| if self.thread1!=None: | |||||
| print("NO!") | |||||
| else: | |||||
| self.thread1 = Thread(target = self.start_now) | |||||
| self.thread1.start() | |||||
| threadActive = 1 | |||||
| def start_now(self): | |||||
| progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate') | |||||
| progress['value'] = 0 | |||||
| progress.pack(side = TOP) | |||||
| Labels = Label(topFrame, text = "SCRAPING") | |||||
| Labels.pack(side = TOP) | |||||
| texts = "change" | |||||
| main_url = 'https://www.theverge.com/tech' | |||||
| uClient = uReq(main_url) | |||||
| page_html = uClient.read() | |||||
| uClient.close() | |||||
| page_soup = soup(page_html, "html.parser") | |||||
| containers = page_soup.findAll("div",{"class":"c-compact-river__entry"}) | |||||
| Articles = len(containers) | |||||
| filename = self.directory + "/News.txt" | |||||
| trans_filename = self.directory + "/TranslatedNews.txt" | |||||
| f = io.open(filename, "w", encoding="utf-8") | |||||
| f.write("ACTIVE") | |||||
| t = io.open(trans_filename, "w", encoding ="utf-8") | |||||
| t.write("ACTIVE") | |||||
| Labels.config(text = "setting file!") | |||||
| i = 0 | |||||
| CurrentTitle = Label(topFrame, text = "Preparing...") | |||||
| CurrentTitle.pack(side = TOP) | |||||
| for container in containers: | |||||
| i = i + 1 | |||||
| Labels.config(text = "jumping to URL!") | |||||
| print(container["class"]) | |||||
| if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']: | |||||
| print("\n WE'VE CATCHED A BUG!") | |||||
| continue | |||||
| if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]: | |||||
| print("\n WARNING! THIS IS NOT AN ARTICLE! ") | |||||
| print(container.div["class"]) | |||||
| continue | |||||
| progress['value'] = i * 100 / Articles | |||||
| local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate') | |||||
| local_progress['value'] = 0 | |||||
| local_progress.pack(side = BOTTOM) | |||||
| requiredURL = container.div.a["href"] | |||||
| secondary_URL = requiredURL | |||||
| print("Set target URL!") | |||||
| secClient = uReq(secondary_URL) | |||||
| news_html = secClient.read() | |||||
| secClient.close() | |||||
| news_soup = soup(news_html, "html.parser") | |||||
| news_soup.decode('utf-8', 'ignore') | |||||
| news_containers = news_soup.findAll("div", {"class":"c-entry-hero__header-wrap"}) | |||||
| if len(news_containers)>0: | |||||
| news_title = news_containers[0].h1.text | |||||
| CurrentTitle.config(text = news_title) | |||||
| Labels.config(text = "Extracted Title!") | |||||
| else: | |||||
| print("ERROR! NO TITLE AT "+secondary_URL) | |||||
| Labels.config(text = "Failed to extract title") | |||||
| news_body = news_soup.findAll("div", {"class":"c-entry-content"}) | |||||
| print("\n TITLE: " + news_title) | |||||
| f.write("\n \n" + news_title + "\n") | |||||
| print("Now translating...") | |||||
| translatedQuery = translate(news_title, "ru", "en") | |||||
| t.write("\n \n" + translatedQuery + "\n") | |||||
| paragraphs = news_body[0].findAll("p") | |||||
| print("Title Recorded!") | |||||
| local_progress['value'] = 10 | |||||
| y = len(paragraphs) | |||||
| x = 0 | |||||
| fullText = "" | |||||
| fullText2 = "" | |||||
| for paragraph in paragraphs: | |||||
| x = x + 1 | |||||
| local_progress['value'] = x * 100 / y + 10 | |||||
| stringx = str(x) | |||||
| Labels.config(text = "Getting paragraph " + stringx + "...") | |||||
| print(paragraph.text + "\n \n \n") | |||||
| if x >= y/2: | |||||
| fullText2 = fullText2 + paragraph.text.strip() | |||||
| else: | |||||
| fullText = fullText + paragraph.text.strip() | |||||
| Labels.config(text = "Written and Translated Paragraph" + stringx + "!") | |||||
| print("Writing Paragraph " + stringx + "...") | |||||
| if self.needToSkip: | |||||
| break | |||||
| if self.needToSkip: | |||||
| self.needToSkip = False | |||||
| continue | |||||
| translatedQuery = translate(fullText, "ru", "en") | |||||
| completeText = translatedQuery | |||||
| translatedQuery = translate(fullText2, "ru", "en") | |||||
| completeText = completeText + translatedQuery | |||||
| f.write("\n" + fullText + fullText2) | |||||
| t.write("\n" + completeText) | |||||
| news_picture = news_soup.findAll("picture", {"class":"c-picture"}) | |||||
| Labels.config(text = "Getting image...") | |||||
| if news_picture[0].img != None: | |||||
| article_pic = news_picture[0].img.get("src") | |||||
| Labels.config(text = "Picture recieved!") | |||||
| else: | |||||
| print("\n THIS ARTICLE HAS NO PICTURE! ") | |||||
| Labels.config(text = "Failed to locate picture :(") | |||||
| local_progress['value'] = 120 | |||||
| f.write("\n PICTURE URL: " + article_pic) | |||||
| t.write("\n PICTURE URL: " + article_pic) | |||||
| if self.stop_threads.is_set(): | |||||
| print("I SURRENDER!") | |||||
| self.stopped = True | |||||
| f.close() | |||||
| t.close() | |||||
| self.CloseLabel.config(text = "you may close now") | |||||
| sys.exit() | |||||
| self.CloseLabel.config(text = "I tried, I failed") | |||||
| break | |||||
| else: | |||||
| print("NOTHING IS STOPPING ME!") | |||||
| Labels.config(text = "Finished the article!") | |||||
| #brand = divWithInfo.div.a.img["title"] | |||||
| #title_container = divWithInfo.find("a", "item-title") | |||||
| #product_name = title_container.text | |||||
| #shipping_container = divWithInfo.find("li", "price-ship") | |||||
| #shipping_cost = shipping_container.text.strip() | |||||
| #print("brand:"+brand) | |||||
| #print("name:"+product_name) | |||||
| #print("shipping:"+shipping_cost) | |||||
| #print("\n") | |||||
| #f.write(brand + "," + product_name.replace(",", "|") + "," + shipping_cost + "\n") | |||||
| Labels.config(text = "All Done!") | |||||
| f.close() | |||||
| t.close() | |||||
# Application bootstrap: build the main window, create the scraper and wire
# the three buttons to it.  The names root, program and topFrame are module
# globals because Scrapers and chooseDirectory look them up at call time.
texts = "VERGE SCRAPPER"
root = Tk()
program = Scrapers()

# Window heading.  `texts` is always defined just above, so no NameError
# fallback is needed here.
theLabel = Label(root, text = texts)
theLabel.pack()

topFrame = Frame(root)
topFrame.pack()
bottomFrame = Frame(root)
bottomFrame.pack(side=BOTTOM)
button1 = Button(topFrame, text = "Start Scrapping!", command = program.start_thread)
button2 = Button(topFrame, text = "Choose Text Location", fg = "black", command = chooseDirectory)
button3 = Button(topFrame, text = "STOP!", fg = "red", command = program.stopTheThread)
button3.pack(side = TOP)
button1.pack(side= TOP)
button2.pack(side = TOP)
root.mainloop()
| @@ -0,0 +1,229 @@ | |||||
| import tkinter | |||||
| from tkinter import * | |||||
| from os.path import expanduser | |||||
| from yandex.Translater import Translater | |||||
| from tkinter import filedialog | |||||
| from tkinter.ttk import Progressbar | |||||
| from urllib.request import urlopen as uReq | |||||
| from bs4 import BeautifulSoup as soup | |||||
| import io | |||||
| import playsound | |||||
| import sys | |||||
| import time | |||||
| from threading import * | |||||
| import os | |||||
| from tkinter import messagebox | |||||
# Default output folder: the current user's Documents directory.
desktop = expanduser("~/Documents")

# Shared Yandex translator, configured for English -> Russian.
tr = Translater()
# SECURITY: an API key committed to source control is readable by anyone with
# access to the repository.  The key can now be supplied through the
# YANDEX_TRANSLATE_KEY environment variable; the literal is kept only as a
# backward-compatible fallback and should be rotated and removed.
tr.set_key(os.environ.get(
    'YANDEX_TRANSLATE_KEY',
    'trnsl.1.1.20200525T143704Z.2789c3467e473787.8844abd61fe46dfedeef7f4f4a43082012802ae9',
))
tr.set_from_lang('en')
tr.set_to_lang('ru')
def chooseDirectory():
    """Ask the user for an output folder and store it on the scraper.

    The dialog opens in the current working directory.  When the user
    dismisses the dialog the previously selected folder is kept, so the
    output files are never redirected to an empty path.
    """
    currdir = os.getcwd()
    tempdir = filedialog.askdirectory(parent=root, initialdir=currdir, title='Please select a directory')
    # askdirectory() hands back an empty value when the dialog is cancelled;
    # storing it would make the scraper write to "/News.txt".
    if tempdir:
        program.directory = tempdir
| class Scrapers(object): | |||||
| def __init__(self): | |||||
| self.thread1 = None | |||||
| self.stop_threads = Event() | |||||
| self.stopped = False | |||||
| self.CloseLabel = Label(root, text = "Finalizing before breaking!") | |||||
| self.directory = desktop | |||||
| self.needToSkip = False | |||||
| def waitandkill(self): | |||||
| time.sleep(1) | |||||
| if (self.stopped == True): | |||||
| print("DEAD") | |||||
| else: | |||||
| self.waitandkill | |||||
| def stopTheThread(self): | |||||
| print("CALLED ME TOO?") | |||||
| self.stop_threads.set() | |||||
| self.CloseLabel.pack() | |||||
| self.waitandkill | |||||
| print("calling wait") | |||||
| def skip(self): | |||||
| self.needToSkip = True | |||||
| def start_thread(self): | |||||
| Skip = Button(topFrame, text = "SKIP!", command = self.skip) | |||||
| Skip.pack(side = BOTTOM) | |||||
| try: | |||||
| f = io.open(self.directory + "/TranslatedNews.txt", "w", encoding="utf-8") | |||||
| except IOError: | |||||
| print("FILE ERROR!" + self.directory + "/TranslatedNews.txt") | |||||
| messagebox.showerror("ACCESS ERROR!", "We can't access "+ self.directory + "/TranslatedNews.txt") | |||||
| sys.exit() | |||||
| try: | |||||
| f = io.open(self.directory + "/News.txt", "w", encoding="utf-8") | |||||
| except IOError: | |||||
| print("FILE ERROR!" + self.directory + "/News.txt") | |||||
| messagebox.showerror("ACCESS ERROR!", "We can't access "+ self.directory + "/News.txt") | |||||
| sys.exit() | |||||
| if self.thread1!=None: | |||||
| print("NO!") | |||||
| else: | |||||
| self.thread1 = Thread(target = self.start_now) | |||||
| self.thread1.start() | |||||
| threadActive = 1 | |||||
| def start_now(self): | |||||
| progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate') | |||||
| progress['value'] = 0 | |||||
| progress.pack(side = TOP) | |||||
| Labels = Label(topFrame, text = "SCRAPING") | |||||
| Labels.pack(side = TOP) | |||||
| texts = "change" | |||||
| main_url = 'https://www.theverge.com/tech' | |||||
| uClient = uReq(main_url) | |||||
| page_html = uClient.read() | |||||
| uClient.close() | |||||
| page_soup = soup(page_html, "html.parser") | |||||
| containers = page_soup.findAll("div",{"class":"c-compact-river__entry"}) | |||||
| Articles = len(containers) | |||||
| filename = self.directory + "/News.txt" | |||||
| trans_filename = self.directory + "/TranslatedNews.txt" | |||||
| f = io.open(filename, "w", encoding="utf-8") | |||||
| f.write("ACTIVE") | |||||
| t = io.open(trans_filename, "w", encoding ="utf-8") | |||||
| t.write("ACTIVE") | |||||
| Labels.config(text = "setting file!") | |||||
| i = 0 | |||||
| CurrentTitle = Label(topFrame, text = "Preparing...") | |||||
| CurrentTitle.pack(side = TOP) | |||||
| for container in containers: | |||||
| i = i + 1 | |||||
| Labels.config(text = "jumping to URL!") | |||||
| print(container["class"]) | |||||
| if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']: | |||||
| print("\n WE'VE CATCHED A BUG!") | |||||
| continue | |||||
| if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]: | |||||
| print("\n WARNING! THIS IS NOT AN ARTICLE! ") | |||||
| print(container.div["class"]) | |||||
| continue | |||||
| progress['value'] = i * 100 / Articles | |||||
| local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate') | |||||
| local_progress['value'] = 0 | |||||
| local_progress.pack(side = BOTTOM) | |||||
| requiredURL = container.div.a["href"] | |||||
| secondary_URL = requiredURL | |||||
| print("Set target URL!") | |||||
| secClient = uReq(secondary_URL) | |||||
| news_html = secClient.read() | |||||
| secClient.close() | |||||
| news_soup = soup(news_html, "html.parser") | |||||
| news_soup.decode('utf-8', 'ignore') | |||||
| news_containers = news_soup.findAll("div", {"class":"c-entry-hero__header-wrap"}) | |||||
| if len(news_containers)>0: | |||||
| news_title = news_containers[0].h1.text | |||||
| CurrentTitle.config(text = news_title) | |||||
| Labels.config(text = "Extracted Title!") | |||||
| else: | |||||
| print("ERROR! NO TITLE AT "+secondary_URL) | |||||
| Labels.config(text = "Failed to extract title") | |||||
| news_body = news_soup.findAll("div", {"class":"c-entry-content"}) | |||||
| print("\n TITLE: " + news_title) | |||||
| f.write("\n \n" + news_title + "\n") | |||||
| print("Now translating...") | |||||
| tr.set_text(news_title) | |||||
| translatedQuery = tr.translate() | |||||
| t.write("\n \n" + translatedQuery + "\n") | |||||
| paragraphs = news_body[0].findAll("p") | |||||
| print("Title Recorded!") | |||||
| local_progress['value'] = 10 | |||||
| y = len(paragraphs) | |||||
| x = 0 | |||||
| fullText = "" | |||||
| fullText2 = "" | |||||
| for paragraph in paragraphs: | |||||
| x = x + 1 | |||||
| local_progress['value'] = x * 100 / y + 10 | |||||
| stringx = str(x) | |||||
| Labels.config(text = "Getting paragraph " + stringx + "...") | |||||
| print(paragraph.text + "\n \n \n") | |||||
| if x >= y/2: | |||||
| fullText2 = fullText2 + paragraph.text.strip() | |||||
| else: | |||||
| fullText = fullText + paragraph.text.strip() | |||||
| Labels.config(text = "Written and Translated Paragraph" + stringx + "!") | |||||
| print("Writing Paragraph " + stringx + "...") | |||||
| if self.needToSkip: | |||||
| break | |||||
| if self.needToSkip: | |||||
| self.needToSkip = False | |||||
| continue | |||||
| tr.set_text((fullText)) | |||||
| translatedQuery = tr.translate() | |||||
| completeText = translatedQuery | |||||
| tr.set_text((fullText)) | |||||
| translatedQuery = tr.translate() | |||||
| completeText = completeText + translatedQuery | |||||
| f.write("\n" + fullText + fullText2) | |||||
| t.write("\n" + completeText) | |||||
| news_picture = news_soup.findAll("picture", {"class":"c-picture"}) | |||||
| Labels.config(text = "Getting image...") | |||||
| if news_picture[0].img != None: | |||||
| article_pic = news_picture[0].img.get("src") | |||||
| Labels.config(text = "Picture recieved!") | |||||
| else: | |||||
| print("\n THIS ARTICLE HAS NO PICTURE! ") | |||||
| Labels.config(text = "Failed to locate picture :(") | |||||
| local_progress['value'] = 120 | |||||
| f.write("\n PICTURE URL: " + article_pic) | |||||
| t.write("\n PICTURE URL: " + article_pic) | |||||
| if self.stop_threads.is_set(): | |||||
| print("I SURRENDER!") | |||||
| self.stopped = True | |||||
| f.close() | |||||
| t.close() | |||||
| self.CloseLabel.config(text = "you may close now") | |||||
| sys.exit() | |||||
| self.CloseLabel.config(text = "I tried, I failed") | |||||
| break | |||||
| else: | |||||
| print("NOTHING IS STOPPING ME!") | |||||
| Labels.config(text = "Finished the article!") | |||||
| #brand = divWithInfo.div.a.img["title"] | |||||
| #title_container = divWithInfo.find("a", "item-title") | |||||
| #product_name = title_container.text | |||||
| #shipping_container = divWithInfo.find("li", "price-ship") | |||||
| #shipping_cost = shipping_container.text.strip() | |||||
| #print("brand:"+brand) | |||||
| #print("name:"+product_name) | |||||
| #print("shipping:"+shipping_cost) | |||||
| #print("\n") | |||||
| #f.write(brand + "," + product_name.replace(",", "|") + "," + shipping_cost + "\n") | |||||
| Labels.config(text = "All Done!") | |||||
| f.close() | |||||
| t.close() | |||||
# Application bootstrap: build the main window, create the scraper and wire
# the three buttons to it.  The names root, program and topFrame are module
# globals because Scrapers and chooseDirectory look them up at call time.
texts = "VERGE SCRAPPER"
root = Tk()
program = Scrapers()

# Window heading.  `texts` is always defined just above, so no NameError
# fallback is needed here.
theLabel = Label(root, text = texts)
theLabel.pack()

topFrame = Frame(root)
topFrame.pack()
bottomFrame = Frame(root)
bottomFrame.pack(side=BOTTOM)
button1 = Button(topFrame, text = "Start Scrapping!", command = program.start_thread)
button2 = Button(topFrame, text = "Choose Text Location", fg = "black", command = chooseDirectory)
button3 = Button(topFrame, text = "STOP!", fg = "red", command = program.stopTheThread)
button3.pack(side = TOP)
button1.pack(side= TOP)
button2.pack(side = TOP)
root.mainloop()
| @@ -0,0 +1,6 @@ | |||||
| # Hi, welcome to Verge Scrapper, a tool for scraping The Verge's website for news and translating it into Russian. | |||||
| To use the tool, just launch the .pyw script with Python 3. | |||||
| <b>Warning! YANDEX.PYW USES YANDEX TRANSLATION API, WHILE GOOGLE.PYW USES GOOGLE TRANSLATE</b> | |||||
| @@ -0,0 +1,20 @@ | |||||
| from distutils.core import setup | |||||
# Distribution metadata for the mtranslate package, collected in one mapping
# and handed to setup() as keyword arguments.
# NOTE(review): `entry_points` is an option documented by setuptools; confirm
# that distutils.core.setup honours it, otherwise the console script is not
# installed.
_SETUP_OPTIONS = {
    'name': 'mtranslate',
    'packages': ['mtranslate'],
    'version': '1.6',
    'description': 'Google translate console script with easy to use API',
    'author': 'Arnaud Alies',
    'author_email': 'arnaudalies.py@gmail.com',
    'url': 'https://github.com/mouuff/mtranslate',
    'download_url': 'https://github.com/mouuff/mtranslate/tarball/1.6',
    'keywords': ['console', 'translate', 'translator', 'simple', 'google', 'language'],
    'classifiers': [],
    'entry_points': {
        'console_scripts': [
            'mtranslate = mtranslate.__main__:main',
        ],
    },
}

setup(**_SETUP_OPTIONS)