From adab35eb0010a7f90ca18460c14be9dd45633502 Mon Sep 17 00:00:00 2001
From: madiwka3
Date: Sun, 21 Jun 2020 18:38:07 +0600
Subject: [PATCH] Added git repo

---
 GOOGLE.pyw | 222 +++++++++++++++++++++++++++++++++++++++++++++++++++
 YANDEX.pyw | 229 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 readme.md  |   6 ++
 setup.py   |  20 +++
 4 files changed, 477 insertions(+)
 create mode 100755 GOOGLE.pyw
 create mode 100755 YANDEX.pyw
 create mode 100644 readme.md
 create mode 100755 setup.py

diff --git a/GOOGLE.pyw b/GOOGLE.pyw
new file mode 100755
index 0000000..99eef0d
--- /dev/null
+++ b/GOOGLE.pyw
@@ -0,0 +1,222 @@
+import tkinter
+from tkinter import *
+from os.path import expanduser
+
+from tkinter import filedialog
+from tkinter.ttk import Progressbar
+from urllib.request import urlopen as uReq
+from bs4 import BeautifulSoup as soup
+import io
+from mtranslate import translate
+import playsound
+import sys
+import time
+from threading import *
+import os
+from tkinter import messagebox
+desktop = expanduser("~/Documents")
+def chooseDirectory():
+    currdir = os.getcwd()
+    tempdir = filedialog.askdirectory(parent=root, initialdir=currdir, title='Please select a directory')
+    program.directory = tempdir
+class Scrapers(object):
+    def __init__(self):
+        self.thread1 = None
+        self.stop_threads = Event()
+        self.stopped = False
+        self.CloseLabel = Label(root, text = "Finalizing before breaking!")
+        self.directory = desktop
+        self.needToSkip = False
+    def waitandkill(self):
+        time.sleep(1)
+        if (self.stopped == True):
+            print("DEAD")
+        else:
+            self.waitandkill()
+    def stopTheThread(self):
+        print("CALLED ME TOO?")
+        self.stop_threads.set()
+        self.CloseLabel.pack()
+        self.waitandkill()
+        print("calling wait")
+    def skip(self):
+        self.needToSkip = True
+    def start_thread(self):
+        Skip = Button(topFrame, text = "SKIP!", command = self.skip)
+        Skip.pack(side = BOTTOM)
+        try:
+            f = io.open(self.directory + "/TranslatedNews.txt", "w", encoding="utf-8")
+        except IOError:
+            print("FILE ERROR!" + self.directory + "/TranslatedNews.txt")
+            messagebox.showerror("ACCESS ERROR!", "We can't access "+ self.directory + "/TranslatedNews.txt")
+            sys.exit()
+        try:
+            f = io.open(self.directory + "/News.txt", "w", encoding="utf-8")
+        except IOError:
+            print("FILE ERROR!" + self.directory + "/News.txt")
+            messagebox.showerror("ACCESS ERROR!", "We can't access "+ self.directory + "/News.txt")
+            sys.exit()
+        if self.thread1!=None:
+            print("NO!")
+        else:
+            self.thread1 = Thread(target = self.start_now)
+            self.thread1.start()
+            threadActive = 1
+    def start_now(self):
+        progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate')
+        progress['value'] = 0
+        progress.pack(side = TOP)
+        Labels = Label(topFrame, text = "SCRAPING")
+        Labels.pack(side = TOP)
+        texts = "change"
+        main_url = 'https://www.theverge.com/tech'
+        uClient = uReq(main_url)
+        page_html = uClient.read()
+        uClient.close()
+        page_soup = soup(page_html, "html.parser")
+        containers = page_soup.findAll("div",{"class":"c-compact-river__entry"})
+        Articles = len(containers)
+        filename = self.directory + "/News.txt"
+        trans_filename = self.directory + "/TranslatedNews.txt"
+        f = io.open(filename, "w", encoding="utf-8")
+        f.write("ACTIVE")
+        t = io.open(trans_filename, "w", encoding ="utf-8")
+        t.write("ACTIVE")
+        Labels.config(text = "setting file!")
+        i = 0
+        CurrentTitle = Label(topFrame, text = "Preparing...")
+        CurrentTitle.pack(side = TOP)
+        for container in containers:
+            i = i + 1
+            Labels.config(text = "jumping to URL!")
+            print(container["class"])
+            if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']:
+                print("\n WE'VE CAUGHT A BUG!")
+                continue
+            if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]:
+                print("\n WARNING! THIS IS NOT AN ARTICLE! ")
+                print(container.div["class"])
+                continue
+            progress['value'] = i * 100 / Articles
+            local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate')
+            local_progress['value'] = 0
+            local_progress.pack(side = BOTTOM)
+            requiredURL = container.div.a["href"]
+            secondary_URL = requiredURL
+            print("Set target URL!")
+            secClient = uReq(secondary_URL)
+            news_html = secClient.read()
+            secClient.close()
+            news_soup = soup(news_html, "html.parser")
+            news_soup.decode('utf-8', 'ignore')
+            news_containers = news_soup.findAll("div", {"class":"c-entry-hero__header-wrap"})
+            if len(news_containers)>0:
+                news_title = news_containers[0].h1.text
+                CurrentTitle.config(text = news_title)
+                Labels.config(text = "Extracted Title!")
+            else:
+                print("ERROR! NO TITLE AT "+secondary_URL)
+                Labels.config(text = "Failed to extract title")
+            news_body = news_soup.findAll("div", {"class":"c-entry-content"})
+            print("\n TITLE: " + news_title)
+            f.write("\n \n" + news_title + "\n")
+            print("Now translating...")
+            translatedQuery = translate(news_title, "ru", "en")
+            t.write("\n \n" + translatedQuery + "\n")
+            paragraphs = news_body[0].findAll("p")
+            print("Title Recorded!")
+            local_progress['value'] = 10
+            y = len(paragraphs)
+            x = 0
+            fullText = ""
+            fullText2 = ""
+            for paragraph in paragraphs:
+
+                x = x + 1
+                local_progress['value'] = x * 100 / y + 10
+                stringx = str(x)
+                Labels.config(text = "Getting paragraph " + stringx + "...")
+                print(paragraph.text + "\n \n \n")
+                if x >= y/2:
+                    fullText2 = fullText2 + paragraph.text.strip()
+                else:
+                    fullText = fullText + paragraph.text.strip()
+                Labels.config(text = "Written and Translated Paragraph " + stringx + "!")
+                print("Writing Paragraph " + stringx + "...")
+                if self.needToSkip:
+                    break
+
+            if self.needToSkip:
+                self.needToSkip = False
+                continue
+            translatedQuery = translate(fullText, "ru", "en")
+            completeText = translatedQuery
+            translatedQuery = translate(fullText2, "ru", "en")
+            completeText = completeText + translatedQuery
+            f.write("\n" + fullText + fullText2)
+            t.write("\n" + completeText)
+            news_picture = news_soup.findAll("picture", {"class":"c-picture"})
+            Labels.config(text = "Getting image...")
+            if news_picture[0].img != None:
+                article_pic = news_picture[0].img.get("src")
+                Labels.config(text = "Picture received!")
+            else:
+                print("\n THIS ARTICLE HAS NO PICTURE! ")
+                Labels.config(text = "Failed to locate picture :(")
+            local_progress['value'] = 120
+            f.write("\n PICTURE URL: " + article_pic)
+            t.write("\n PICTURE URL: " + article_pic)
+            if self.stop_threads.is_set():
+                print("I SURRENDER!")
+                self.stopped = True
+                f.close()
+                t.close()
+                self.CloseLabel.config(text = "you may close now")
+                sys.exit()
+                self.CloseLabel.config(text = "I tried, I failed")
+                break
+            else:
+                print("NOTHING IS STOPPING ME!")
+            Labels.config(text = "Finished the article!")
+            #brand = divWithInfo.div.a.img["title"]
+            #title_container = divWithInfo.find("a", "item-title")
+            #product_name = title_container.text
+            #shipping_container = divWithInfo.find("li", "price-ship")
+            #shipping_cost = shipping_container.text.strip()
+
+            #print("brand:"+brand)
+            #print("name:"+product_name)
+            #print("shipping:"+shipping_cost)
+            #print("\n")
+
+            #f.write(brand + "," + product_name.replace(",", "|") + "," + shipping_cost + "\n")
+        Labels.config(text = "All Done!")
+        f.close()
+        t.close()
+texts = "VERGE SCRAPER"
+root = Tk()
+program = Scrapers()
+mainT = Thread(target=program.start_now)
+try:
+    texts
+except NameError:
+    theLabel = Label(root, text = "VERGE SCRAPER")
+    theLabel.pack()
+    print("NO TEXTS!")
+else:
+    theLabel = Label(root, text = texts)
+    theLabel.pack()
+    print("FOUND TEXTS!")
+
+stop_thread = False
+topFrame = Frame(root)
+topFrame.pack()
+bottomFrame = Frame(root)
+bottomFrame.pack(side=BOTTOM)
+button1 = Button(topFrame, text = "Start Scraping!", command = program.start_thread)
+button2 = Button(topFrame, text = "Choose Text Location", fg = "black", command = chooseDirectory)
+button3 = Button(topFrame, text = "STOP!", fg = "red", command = program.stopTheThread)
+button3.pack(side = TOP)
+button1.pack(side= TOP)
+button2.pack(side = TOP)
+root.mainloop()
\ No newline at end of file
diff --git a/YANDEX.pyw b/YANDEX.pyw
new file mode 100755
index 0000000..2c1d406
--- /dev/null
+++ b/YANDEX.pyw
@@ -0,0 +1,229 @@
+import tkinter
+from tkinter import *
+from os.path import expanduser
+from yandex.Translater import Translater
+
+from tkinter import filedialog
+from tkinter.ttk import Progressbar
+from urllib.request import urlopen as uReq
+from bs4 import BeautifulSoup as soup
+import io
+import playsound
+import sys
+import time
+from threading import *
+import os
+from tkinter import messagebox
+desktop = expanduser("~/Documents")
+tr = Translater()
+tr.set_key('trnsl.1.1.20200525T143704Z.2789c3467e473787.8844abd61fe46dfedeef7f4f4a43082012802ae9')
+tr.set_from_lang('en')
+tr.set_to_lang('ru')
+def chooseDirectory():
+    currdir = os.getcwd()
+    tempdir = filedialog.askdirectory(parent=root, initialdir=currdir, title='Please select a directory')
+    program.directory = tempdir
+class Scrapers(object):
+    def __init__(self):
+        self.thread1 = None
+        self.stop_threads = Event()
+        self.stopped = False
+        self.CloseLabel = Label(root, text = "Finalizing before breaking!")
+        self.directory = desktop
+        self.needToSkip = False
+    def waitandkill(self):
+        time.sleep(1)
+        if (self.stopped == True):
+            print("DEAD")
+        else:
+            self.waitandkill()
+    def stopTheThread(self):
+        print("CALLED ME TOO?")
+        self.stop_threads.set()
+        self.CloseLabel.pack()
+        self.waitandkill()
+        print("calling wait")
+    def skip(self):
+        self.needToSkip = True
+    def start_thread(self):
+        Skip = Button(topFrame, text = "SKIP!", command = self.skip)
+        Skip.pack(side = BOTTOM)
+        try:
+            f = io.open(self.directory + "/TranslatedNews.txt", "w", encoding="utf-8")
+        except IOError:
+            print("FILE ERROR!" + self.directory + "/TranslatedNews.txt")
+            messagebox.showerror("ACCESS ERROR!", "We can't access "+ self.directory + "/TranslatedNews.txt")
+            sys.exit()
+        try:
+            f = io.open(self.directory + "/News.txt", "w", encoding="utf-8")
+        except IOError:
+            print("FILE ERROR!" + self.directory + "/News.txt")
+            messagebox.showerror("ACCESS ERROR!", "We can't access "+ self.directory + "/News.txt")
+            sys.exit()
+        if self.thread1!=None:
+            print("NO!")
+        else:
+            self.thread1 = Thread(target = self.start_now)
+            self.thread1.start()
+            threadActive = 1
+    def start_now(self):
+        progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate')
+        progress['value'] = 0
+        progress.pack(side = TOP)
+        Labels = Label(topFrame, text = "SCRAPING")
+        Labels.pack(side = TOP)
+        texts = "change"
+        main_url = 'https://www.theverge.com/tech'
+        uClient = uReq(main_url)
+        page_html = uClient.read()
+        uClient.close()
+        page_soup = soup(page_html, "html.parser")
+        containers = page_soup.findAll("div",{"class":"c-compact-river__entry"})
+        Articles = len(containers)
+        filename = self.directory + "/News.txt"
+        trans_filename = self.directory + "/TranslatedNews.txt"
+        f = io.open(filename, "w", encoding="utf-8")
+        f.write("ACTIVE")
+        t = io.open(trans_filename, "w", encoding ="utf-8")
+        t.write("ACTIVE")
+        Labels.config(text = "setting file!")
+        i = 0
+        CurrentTitle = Label(topFrame, text = "Preparing...")
+        CurrentTitle.pack(side = TOP)
+        for container in containers:
+            i = i + 1
+            Labels.config(text = "jumping to URL!")
+            print(container["class"])
+            if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']:
+                print("\n WE'VE CAUGHT A BUG!")
+                continue
+            if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]:
+                print("\n WARNING! THIS IS NOT AN ARTICLE! ")
") + print(container.div["class"]) + continue + progress['value'] = i * 100 / Articles + local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate') + local_progress['value'] = 0 + local_progress.pack(side = BOTTOM) + requiredURL = container.div.a["href"] + secondary_URL = requiredURL + print("Set target URL!") + secClient = uReq(secondary_URL) + news_html = secClient.read() + secClient.close() + news_soup = soup(news_html, "html.parser") + news_soup.decode('utf-8', 'ignore') + news_containers = news_soup.findAll("div", {"class":"c-entry-hero__header-wrap"}) + if len(news_containers)>0: + news_title = news_containers[0].h1.text + CurrentTitle.config(text = news_title) + Labels.config(text = "Extracted Title!") + else: + print("ERROR! NO TITLE AT "+secondary_URL) + Labels.config(text = "Failed to extract title") + news_body = news_soup.findAll("div", {"class":"c-entry-content"}) + print("\n TITLE: " + news_title) + f.write("\n \n" + news_title + "\n") + print("Now translating...") + tr.set_text(news_title) + translatedQuery = tr.translate() + t.write("\n \n" + translatedQuery + "\n") + paragraphs = news_body[0].findAll("p") + print("Title Recorded!") + local_progress['value'] = 10 + y = len(paragraphs) + x = 0 + fullText = "" + fullText2 = "" + for paragraph in paragraphs: + + x = x + 1 + local_progress['value'] = x * 100 / y + 10 + stringx = str(x) + Labels.config(text = "Getting paragraph " + stringx + "...") + print(paragraph.text + "\n \n \n") + if x >= y/2: + fullText2 = fullText2 + paragraph.text.strip() + else: + fullText = fullText + paragraph.text.strip() + Labels.config(text = "Written and Translated Paragraph" + stringx + "!") + print("Writing Paragraph " + stringx + "...") + if self.needToSkip: + break + + if self.needToSkip: + self.needToSkip = False + continue + tr.set_text((fullText)) + translatedQuery = tr.translate() + completeText = translatedQuery + tr.set_text((fullText)) + translatedQuery = tr.translate() + completeText = completeText + translatedQuery + f.write("\n" + fullText + fullText2) + t.write("\n" + completeText) + news_picture = news_soup.findAll("picture", {"class":"c-picture"}) + Labels.config(text = "Getting image...") + if news_picture[0].img != None: + article_pic = news_picture[0].img.get("src") + Labels.config(text = "Picture recieved!") + else: + print("\n THIS ARTICLE HAS NO PICTURE! 
") + Labels.config(text = "Failed to locate picture :(") + local_progress['value'] = 120 + f.write("\n PICTURE URL: " + article_pic) + t.write("\n PICTURE URL: " + article_pic) + if self.stop_threads.is_set(): + print("I SURRENDER!") + self.stopped = True + f.close() + t.close() + self.CloseLabel.config(text = "you may close now") + sys.exit() + self.CloseLabel.config(text = "I tried, I failed") + break + else: + print("NOTHING IS STOPPING ME!") + Labels.config(text = "Finished the article!") + #brand = divWithInfo.div.a.img["title"] + #title_container = divWithInfo.find("a", "item-title") + #product_name = title_container.text + #shipping_container = divWithInfo.find("li", "price-ship") + #shipping_cost = shipping_container.text.strip() + + #print("brand:"+brand) + #print("name:"+product_name) + #print("shipping:"+shipping_cost) + #print("\n") + + #f.write(brand + "," + product_name.replace(",", "|") + "," + shipping_cost + "\n") + Labels.config(text = "All Done!") + f.close() + t.close() +texts = "VERGE SCRAPPER" +root = Tk() +program = Scrapers() +mainT = Thread(target=program.start_now) +try: + texts +except NameError: + theLabel = Label(root, text = "VERGE SCRAPER") + theLabel.pack() + print("NO TEXTS!") +else: + theLabel = Label(root, text = texts) + theLabel.pack() + print("FOUND TEXTS!") + +stop_thread = False +topFrame = Frame(root) +topFrame.pack() +bottomFrame = Frame(root) +bottomFrame.pack(side=BOTTOM) +button1 = Button(topFrame, text = "Start Scrapping!", command = program.start_thread) +button2 = Button(topFrame, text = "Choose Text Location", fg = "black", command = chooseDirectory) +button3 = Button(topFrame, text = "STOP!", fg = "red", command = program.stopTheThread) +button3.pack(side = TOP) +button1.pack(side= TOP) +button2.pack(side = TOP) +root.mainloop() \ No newline at end of file diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..4fb2767 --- /dev/null +++ b/readme.md @@ -0,0 +1,6 @@ +#Hi, welcome to Verge Scrapper. A tool used for scraping the Verge's website for news and translating them into russian language. + + +To use the tool, just launch the .pyw executable with python3. + +Warning! YANDEX.PYW USES YANDEX TRANSLATION API, WHILE GOOGLE.PYW USES GOOGLE TRANSLATE \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..48de04d --- /dev/null +++ b/setup.py @@ -0,0 +1,20 @@ + +from distutils.core import setup + +setup( + name = 'mtranslate', + packages = ['mtranslate'], + version = '1.6', + description = 'Google translate console script with easy to use API', + author = 'Arnaud Alies', + author_email = 'arnaudalies.py@gmail.com', + url = 'https://github.com/mouuff/mtranslate', + download_url = 'https://github.com/mouuff/mtranslate/tarball/1.6', + keywords = ['console', 'translate', 'translator', 'simple', 'google', 'language'], + classifiers = [], + entry_points={ + 'console_scripts': [ + 'mtranslate = mtranslate.__main__:main' + ] + }, +)