@@ -0,0 +1,222 @@ | |||
import tkinter | |||
from tkinter import * | |||
from os.path import expanduser | |||
from tkinter import filedialog | |||
from tkinter.ttk import Progressbar | |||
from urllib.request import urlopen as uReq | |||
from bs4 import BeautifulSoup as soup | |||
import io | |||
from mtranslate import translate | |||
import playsound | |||
import sys | |||
import time | |||
from threading import * | |||
import os | |||
from tkinter import messagebox | |||
# Default output directory: the user's Documents folder.
desktop = expanduser("~/Documents")

def chooseDirectory():
    """Open a directory-picker dialog and store the choice on the global scraper.

    Uses the module-level ``root`` window as the dialog parent and writes
    the selection to ``program.directory``.
    """
    start_dir = os.getcwd()
    chosen = filedialog.askdirectory(parent=root, initialdir=start_dir, title='Please select a directory')
    program.directory = chosen
class Scrapers(object):
    """Scrape The Verge's tech page on a worker thread.

    Writes the raw articles to ``<directory>/News.txt`` and a Russian
    translation (via ``mtranslate.translate``) to
    ``<directory>/TranslatedNews.txt``.  Relies on the module-level
    ``root`` and ``topFrame`` tkinter widgets for its progress UI.
    """

    def __init__(self):
        self.thread1 = None          # worker thread, created by start_thread()
        self.stop_threads = Event()  # set by stopTheThread() to request shutdown
        self.stopped = False         # set True by the worker once it has exited
        self.CloseLabel = Label(root, text = "Finalizing before breaking!")
        self.directory = desktop     # output directory; changed by chooseDirectory()
        self.needToSkip = False      # set by skip() to abandon the current article

    def waitandkill(self):
        """Poll once a second until the worker reports it has stopped.

        BUG FIX: the original "recursed" via ``self.waitandkill`` without
        parentheses — a no-op attribute access — so the wait never ran.
        A loop replaces the intended recursion.
        """
        while not self.stopped:
            time.sleep(1)
        print("DEAD")

    def stopTheThread(self):
        """Request the worker thread to stop and wait for it in the background."""
        print("CALLED ME TOO?")
        self.stop_threads.set()
        self.CloseLabel.pack()
        # BUG FIX: the original wrote ``self.waitandkill`` without calling it.
        # Wait on a daemon thread so the tkinter mainloop stays responsive.
        Thread(target = self.waitandkill, daemon = True).start()
        print("calling wait")

    def skip(self):
        """Flag the worker to abandon the article it is currently scraping."""
        self.needToSkip = True

    def start_thread(self):
        """Verify both output files are writable, then launch the worker thread."""
        Skip = Button(topFrame, text = "SKIP!", command = self.skip)
        Skip.pack(side = BOTTOM)
        # Probe each output file for writability before starting the worker.
        # BUG FIX: the original never closed the first probe handle (leak).
        for name in ("/TranslatedNews.txt", "/News.txt"):
            path = self.directory + name
            try:
                io.open(path, "w", encoding = "utf-8").close()
            except IOError:
                print("FILE ERROR!" + path)
                messagebox.showerror("ACCESS ERROR!", "We can't access " + path)
                sys.exit()
        if self.thread1 is not None:
            print("NO!")  # a worker is already running; never start a second one
        else:
            self.thread1 = Thread(target = self.start_now)
            self.thread1.start()

    def start_now(self):
        """Worker body: scrape, translate and record every article on the page."""
        progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate')
        progress['value'] = 0
        progress.pack(side = TOP)
        Labels = Label(topFrame, text = "SCRAPING")
        Labels.pack(side = TOP)
        main_url = 'https://www.theverge.com/tech'
        uClient = uReq(main_url)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        containers = page_soup.findAll("div", {"class":"c-compact-river__entry"})
        Articles = len(containers)
        filename = self.directory + "/News.txt"
        trans_filename = self.directory + "/TranslatedNews.txt"
        f = io.open(filename, "w", encoding="utf-8")
        f.write("ACTIVE")
        t = io.open(trans_filename, "w", encoding="utf-8")
        t.write("ACTIVE")
        Labels.config(text = "setting file!")
        i = 0
        CurrentTitle = Label(topFrame, text = "Preparing...")
        CurrentTitle.pack(side = TOP)
        for container in containers:
            i = i + 1
            Labels.config(text = "jumping to URL!")
            print(container["class"])
            # "Featured" river entries have a different layout; skip them.
            if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']:
                print("\n WE'VE CATCHED A BUG!")
                continue
            if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]:
                print("\n WARNING! THIS IS NOT AN ARTICLE! ")
                print(container.div["class"])
                continue
            progress['value'] = i * 100 / Articles
            local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate')
            local_progress['value'] = 0
            local_progress.pack(side = BOTTOM)
            secondary_URL = container.div.a["href"]
            print("Set target URL!")
            secClient = uReq(secondary_URL)
            news_html = secClient.read()
            secClient.close()
            news_soup = soup(news_html, "html.parser")
            news_containers = news_soup.findAll("div", {"class":"c-entry-hero__header-wrap"})
            if len(news_containers) > 0:
                news_title = news_containers[0].h1.text
                CurrentTitle.config(text = news_title)
                Labels.config(text = "Extracted Title!")
            else:
                # BUG FIX: the original fell through and crashed on an
                # undefined news_title; skip articles without a headline.
                print("ERROR! NO TITLE AT " + secondary_URL)
                Labels.config(text = "Failed to extract title")
                continue
            news_body = news_soup.findAll("div", {"class":"c-entry-content"})
            print("\n TITLE: " + news_title)
            f.write("\n \n" + news_title + "\n")
            print("Now translating...")
            translatedQuery = translate(news_title, "ru", "en")
            t.write("\n \n" + translatedQuery + "\n")
            paragraphs = news_body[0].findAll("p")
            print("Title Recorded!")
            local_progress['value'] = 10
            y = len(paragraphs)
            x = 0
            fullText = ""
            fullText2 = ""
            for paragraph in paragraphs:
                x = x + 1
                local_progress['value'] = x * 100 / y + 10
                stringx = str(x)
                Labels.config(text = "Getting paragraph " + stringx + "...")
                print(paragraph.text + "\n \n \n")
                # Split the body in half so each translation request stays small.
                if x >= y / 2:
                    fullText2 = fullText2 + paragraph.text.strip()
                else:
                    fullText = fullText + paragraph.text.strip()
                Labels.config(text = "Written and Translated Paragraph" + stringx + "!")
                print("Writing Paragraph " + stringx + "...")
                if self.needToSkip:
                    break
            if self.needToSkip:
                self.needToSkip = False
                continue
            completeText = translate(fullText, "ru", "en") + translate(fullText2, "ru", "en")
            f.write("\n" + fullText + fullText2)
            t.write("\n" + completeText)
            news_picture = news_soup.findAll("picture", {"class":"c-picture"})
            Labels.config(text = "Getting image...")
            # BUG FIX: the original wrote article_pic unconditionally, crashing
            # (or reusing a stale URL) when an article had no picture element.
            article_pic = None
            if len(news_picture) > 0 and news_picture[0].img is not None:
                article_pic = news_picture[0].img.get("src")
                Labels.config(text = "Picture recieved!")
            else:
                print("\n THIS ARTICLE HAS NO PICTURE! ")
                Labels.config(text = "Failed to locate picture :(")
            local_progress['value'] = 120
            if article_pic is not None:
                f.write("\n PICTURE URL: " + article_pic)
                t.write("\n PICTURE URL: " + article_pic)
            if self.stop_threads.is_set():
                # Shutdown requested: close files, report, and end this thread.
                # (The original had unreachable lines after sys.exit(); removed.)
                print("I SURRENDER!")
                self.stopped = True
                f.close()
                t.close()
                self.CloseLabel.config(text = "you may close now")
                sys.exit()
            else:
                print("NOTHING IS STOPPING ME!")
                Labels.config(text = "Finished the article!")
        Labels.config(text = "All Done!")
        f.close()
        t.close()
# ---------------------------------------------------------------------------
# Module-level GUI wiring: build the window and hook the buttons to `program`.
# ---------------------------------------------------------------------------
texts = "VERGE SCRAPPER"  # caption shown in the header label

root = Tk()
program = Scrapers()

# BUG FIX (dead code removal): `texts` is assigned two lines above, so the
# original try/except NameError fallback could never trigger; the unused
# `mainT` thread (created but never started) and `stop_thread` flag are gone.
theLabel = Label(root, text = texts)
theLabel.pack()
print("FOUND TEXTS!")

topFrame = Frame(root)      # holds the buttons and the worker's progress widgets
topFrame.pack()
bottomFrame = Frame(root)
bottomFrame.pack(side = BOTTOM)

button1 = Button(topFrame, text = "Start Scrapping!", command = program.start_thread)
button2 = Button(topFrame, text = "Choose Text Location", fg = "black", command = chooseDirectory)
button3 = Button(topFrame, text = "STOP!", fg = "red", command = program.stopTheThread)
button3.pack(side = TOP)
button1.pack(side = TOP)
button2.pack(side = TOP)

root.mainloop()
@@ -0,0 +1,229 @@ | |||
import tkinter | |||
from tkinter import * | |||
from os.path import expanduser | |||
from yandex.Translater import Translater | |||
from tkinter import filedialog | |||
from tkinter.ttk import Progressbar | |||
from urllib.request import urlopen as uReq | |||
from bs4 import BeautifulSoup as soup | |||
import io | |||
import playsound | |||
import sys | |||
import time | |||
from threading import * | |||
import os | |||
from tkinter import messagebox | |||
# Default output directory: the user's Documents folder.
desktop = expanduser("~/Documents")
# Yandex Translate client, configured once at import time to translate
# English -> Russian (used by Scrapers.start_now).
# SECURITY NOTE(review): the API key below is hard-coded and committed to
# the repository — it should be revoked and loaded from an environment
# variable or config file instead.
tr = Translater()
tr.set_key('trnsl.1.1.20200525T143704Z.2789c3467e473787.8844abd61fe46dfedeef7f4f4a43082012802ae9')
tr.set_from_lang('en')
tr.set_to_lang('ru')
def chooseDirectory():
    """Open a directory-picker dialog and store the choice on the global scraper.

    Uses the module-level ``root`` window as the dialog parent and writes
    the selection to ``program.directory``.
    """
    start_dir = os.getcwd()
    chosen = filedialog.askdirectory(parent=root, initialdir=start_dir, title='Please select a directory')
    program.directory = chosen
class Scrapers(object):
    """Scrape The Verge's tech page on a worker thread.

    Writes the raw articles to ``<directory>/News.txt`` and a Russian
    translation (via the module-level Yandex ``tr`` client) to
    ``<directory>/TranslatedNews.txt``.  Relies on the module-level
    ``root`` and ``topFrame`` tkinter widgets for its progress UI.
    """

    def __init__(self):
        self.thread1 = None          # worker thread, created by start_thread()
        self.stop_threads = Event()  # set by stopTheThread() to request shutdown
        self.stopped = False         # set True by the worker once it has exited
        self.CloseLabel = Label(root, text = "Finalizing before breaking!")
        self.directory = desktop     # output directory; changed by chooseDirectory()
        self.needToSkip = False      # set by skip() to abandon the current article

    def waitandkill(self):
        """Poll once a second until the worker reports it has stopped.

        BUG FIX: the original "recursed" via ``self.waitandkill`` without
        parentheses — a no-op attribute access — so the wait never ran.
        A loop replaces the intended recursion.
        """
        while not self.stopped:
            time.sleep(1)
        print("DEAD")

    def stopTheThread(self):
        """Request the worker thread to stop and wait for it in the background."""
        print("CALLED ME TOO?")
        self.stop_threads.set()
        self.CloseLabel.pack()
        # BUG FIX: the original wrote ``self.waitandkill`` without calling it.
        # Wait on a daemon thread so the tkinter mainloop stays responsive.
        Thread(target = self.waitandkill, daemon = True).start()
        print("calling wait")

    def skip(self):
        """Flag the worker to abandon the article it is currently scraping."""
        self.needToSkip = True

    def start_thread(self):
        """Verify both output files are writable, then launch the worker thread."""
        Skip = Button(topFrame, text = "SKIP!", command = self.skip)
        Skip.pack(side = BOTTOM)
        # Probe each output file for writability before starting the worker.
        # BUG FIX: the original never closed the first probe handle (leak).
        for name in ("/TranslatedNews.txt", "/News.txt"):
            path = self.directory + name
            try:
                io.open(path, "w", encoding = "utf-8").close()
            except IOError:
                print("FILE ERROR!" + path)
                messagebox.showerror("ACCESS ERROR!", "We can't access " + path)
                sys.exit()
        if self.thread1 is not None:
            print("NO!")  # a worker is already running; never start a second one
        else:
            self.thread1 = Thread(target = self.start_now)
            self.thread1.start()

    def start_now(self):
        """Worker body: scrape, translate and record every article on the page."""
        progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate')
        progress['value'] = 0
        progress.pack(side = TOP)
        Labels = Label(topFrame, text = "SCRAPING")
        Labels.pack(side = TOP)
        main_url = 'https://www.theverge.com/tech'
        uClient = uReq(main_url)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        containers = page_soup.findAll("div", {"class":"c-compact-river__entry"})
        Articles = len(containers)
        filename = self.directory + "/News.txt"
        trans_filename = self.directory + "/TranslatedNews.txt"
        f = io.open(filename, "w", encoding="utf-8")
        f.write("ACTIVE")
        t = io.open(trans_filename, "w", encoding="utf-8")
        t.write("ACTIVE")
        Labels.config(text = "setting file!")
        i = 0
        CurrentTitle = Label(topFrame, text = "Preparing...")
        CurrentTitle.pack(side = TOP)
        for container in containers:
            i = i + 1
            Labels.config(text = "jumping to URL!")
            print(container["class"])
            # "Featured" river entries have a different layout; skip them.
            if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']:
                print("\n WE'VE CATCHED A BUG!")
                continue
            if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]:
                print("\n WARNING! THIS IS NOT AN ARTICLE! ")
                print(container.div["class"])
                continue
            progress['value'] = i * 100 / Articles
            local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate')
            local_progress['value'] = 0
            local_progress.pack(side = BOTTOM)
            secondary_URL = container.div.a["href"]
            print("Set target URL!")
            secClient = uReq(secondary_URL)
            news_html = secClient.read()
            secClient.close()
            news_soup = soup(news_html, "html.parser")
            news_containers = news_soup.findAll("div", {"class":"c-entry-hero__header-wrap"})
            if len(news_containers) > 0:
                news_title = news_containers[0].h1.text
                CurrentTitle.config(text = news_title)
                Labels.config(text = "Extracted Title!")
            else:
                # BUG FIX: the original fell through and crashed on an
                # undefined news_title; skip articles without a headline.
                print("ERROR! NO TITLE AT " + secondary_URL)
                Labels.config(text = "Failed to extract title")
                continue
            news_body = news_soup.findAll("div", {"class":"c-entry-content"})
            print("\n TITLE: " + news_title)
            f.write("\n \n" + news_title + "\n")
            print("Now translating...")
            tr.set_text(news_title)
            translatedQuery = tr.translate()
            t.write("\n \n" + translatedQuery + "\n")
            paragraphs = news_body[0].findAll("p")
            print("Title Recorded!")
            local_progress['value'] = 10
            y = len(paragraphs)
            x = 0
            fullText = ""
            fullText2 = ""
            for paragraph in paragraphs:
                x = x + 1
                local_progress['value'] = x * 100 / y + 10
                stringx = str(x)
                Labels.config(text = "Getting paragraph " + stringx + "...")
                print(paragraph.text + "\n \n \n")
                # Split the body in half so each translation request stays small.
                if x >= y / 2:
                    fullText2 = fullText2 + paragraph.text.strip()
                else:
                    fullText = fullText + paragraph.text.strip()
                Labels.config(text = "Written and Translated Paragraph" + stringx + "!")
                print("Writing Paragraph " + stringx + "...")
                if self.needToSkip:
                    break
            if self.needToSkip:
                self.needToSkip = False
                continue
            tr.set_text(fullText)
            completeText = tr.translate()
            # BUG FIX: the original passed fullText again here, so the second
            # half of every article was never translated (the first half was
            # duplicated instead).
            tr.set_text(fullText2)
            completeText = completeText + tr.translate()
            f.write("\n" + fullText + fullText2)
            t.write("\n" + completeText)
            news_picture = news_soup.findAll("picture", {"class":"c-picture"})
            Labels.config(text = "Getting image...")
            # BUG FIX: the original wrote article_pic unconditionally, crashing
            # (or reusing a stale URL) when an article had no picture element.
            article_pic = None
            if len(news_picture) > 0 and news_picture[0].img is not None:
                article_pic = news_picture[0].img.get("src")
                Labels.config(text = "Picture recieved!")
            else:
                print("\n THIS ARTICLE HAS NO PICTURE! ")
                Labels.config(text = "Failed to locate picture :(")
            local_progress['value'] = 120
            if article_pic is not None:
                f.write("\n PICTURE URL: " + article_pic)
                t.write("\n PICTURE URL: " + article_pic)
            if self.stop_threads.is_set():
                # Shutdown requested: close files, report, and end this thread.
                # (The original had unreachable lines after sys.exit(); removed.)
                print("I SURRENDER!")
                self.stopped = True
                f.close()
                t.close()
                self.CloseLabel.config(text = "you may close now")
                sys.exit()
            else:
                print("NOTHING IS STOPPING ME!")
                Labels.config(text = "Finished the article!")
        Labels.config(text = "All Done!")
        f.close()
        t.close()
# ---------------------------------------------------------------------------
# Module-level GUI wiring: build the window and hook the buttons to `program`.
# ---------------------------------------------------------------------------
texts = "VERGE SCRAPPER"  # caption shown in the header label

root = Tk()
program = Scrapers()

# BUG FIX (dead code removal): `texts` is assigned two lines above, so the
# original try/except NameError fallback could never trigger; the unused
# `mainT` thread (created but never started) and `stop_thread` flag are gone.
theLabel = Label(root, text = texts)
theLabel.pack()
print("FOUND TEXTS!")

topFrame = Frame(root)      # holds the buttons and the worker's progress widgets
topFrame.pack()
bottomFrame = Frame(root)
bottomFrame.pack(side = BOTTOM)

button1 = Button(topFrame, text = "Start Scrapping!", command = program.start_thread)
button2 = Button(topFrame, text = "Choose Text Location", fg = "black", command = chooseDirectory)
button3 = Button(topFrame, text = "STOP!", fg = "red", command = program.stopTheThread)
button3.pack(side = TOP)
button1.pack(side = TOP)
button2.pack(side = TOP)

root.mainloop()
@@ -0,0 +1,6 @@ | |||
#Hi, welcome to Verge Scraper — a tool for scraping The Verge's website for news and translating it into Russian.
To use the tool, just launch the .pyw executable with Python 3.
<b>Warning! YANDEX.PYW USES THE YANDEX TRANSLATION API, WHILE GOOGLE.PYW USES GOOGLE TRANSLATE</b>
@@ -0,0 +1,20 @@ | |||
# Packaging script for the vendored `mtranslate` library.
# NOTE(review): distutils is deprecated (PEP 632) and removed in Python 3.12;
# consider `from setuptools import setup` when a setuptools dependency is
# acceptable for this project.
from distutils.core import setup
setup(
    name = 'mtranslate',
    packages = ['mtranslate'],
    version = '1.6',
    description = 'Google translate console script with easy to use API',
    author = 'Arnaud Alies',
    author_email = 'arnaudalies.py@gmail.com',
    url = 'https://github.com/mouuff/mtranslate',
    download_url = 'https://github.com/mouuff/mtranslate/tarball/1.6',
    keywords = ['console', 'translate', 'translator', 'simple', 'google', 'language'],
    classifiers = [],
    # Installs the `mtranslate` console command pointing at the package main().
    entry_points={
        'console_scripts': [
            'mtranslate = mtranslate.__main__:main'
        ]
    },
)