@@ -0,0 +1,222 @@ | |||||
import tkinter | |||||
from tkinter import * | |||||
from os.path import expanduser | |||||
from tkinter import filedialog | |||||
from tkinter.ttk import Progressbar | |||||
from urllib.request import urlopen as uReq | |||||
from bs4 import BeautifulSoup as soup | |||||
import io | |||||
from mtranslate import translate | |||||
import playsound | |||||
import sys | |||||
import time | |||||
from threading import * | |||||
import os | |||||
from tkinter import messagebox | |||||
desktop = expanduser("~/Documents") | |||||
def chooseDirectory():
    """Prompt the user for an output folder and remember it on the scraper.

    Uses the current working directory as the dialog's starting point and
    stores the selection on the module-level `program` instance.
    """
    start_dir = os.getcwd()
    chosen = filedialog.askdirectory(parent=root, initialdir=start_dir,
                                     title='Please select a directory')
    program.directory = chosen
class Scrapers(object):
    """Scrapes The Verge's tech page, writing each article's English text to
    News.txt and a Russian translation to TranslatedNews.txt, while updating
    a tkinter UI with progress.

    NOTE(review): relies on module-level globals `root` and `topFrame`
    (the tkinter widgets) existing before any method is called.
    """

    def __init__(self):
        self.thread1 = None            # worker thread, created lazily by start_thread
        self.stop_threads = Event()    # set -> worker should shut down
        self.stopped = False           # worker confirms shutdown through this flag
        self.CloseLabel = Label(root, text = "Finalizing before breaking!")
        self.directory = desktop       # output folder; chooseDirectory() may change it
        self.needToSkip = False        # set -> abandon the article currently being read

    def waitandkill(self):
        """Block until the worker confirms it has stopped.

        Bug fix: the original referenced `self.waitandkill` without calling
        it, so this returned after one second no matter what.  A loop also
        avoids the original's unbounded recursion.
        """
        while not self.stopped:
            time.sleep(1)
        print("DEAD")

    def stopTheThread(self):
        """Ask the worker to stop, then wait for it to finish."""
        print("CALLED ME TOO?")
        self.stop_threads.set()
        self.CloseLabel.pack()
        print("calling wait")
        # Bug fix: original wrote `self.waitandkill` (no parentheses),
        # which never invoked the wait at all.
        self.waitandkill()

    def skip(self):
        """Flag the worker to abandon the article it is currently reading."""
        self.needToSkip = True

    def start_thread(self):
        """Verify both output files are writable, then launch the worker thread."""
        skip_button = Button(topFrame, text = "SKIP!", command = self.skip)
        skip_button.pack(side = BOTTOM)
        # Probe each output file up front so a friendly error appears before
        # any scraping happens.  Bug fix: the original leaked both probe
        # handles (the first was silently overwritten, neither was closed).
        for tail in ("/TranslatedNews.txt", "/News.txt"):
            path = self.directory + tail
            try:
                io.open(path, "w", encoding="utf-8").close()
            except IOError:
                print("FILE ERROR!" + path)
                messagebox.showerror("ACCESS ERROR!", "We can't access "+ path)
                sys.exit()
        if self.thread1 is not None:
            print("NO!")   # a worker already exists; never run two at once
        else:
            self.thread1 = Thread(target = self.start_now)
            self.thread1.start()

    def start_now(self):
        """Worker thread body: scrape every article listed on the Verge tech
        page, write originals and Russian translations, update the UI.

        Honours `self.needToSkip` (drop the current article) and
        `self.stop_threads` (shut down cleanly).
        """
        progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate')
        progress['value'] = 0
        progress.pack(side = TOP)
        Labels = Label(topFrame, text = "SCRAPING")
        Labels.pack(side = TOP)
        main_url = 'https://www.theverge.com/tech'
        uClient = uReq(main_url)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        containers = page_soup.findAll("div",{"class":"c-compact-river__entry"})
        Articles = len(containers)
        filename = self.directory + "/News.txt"
        trans_filename = self.directory + "/TranslatedNews.txt"
        f = io.open(filename, "w", encoding="utf-8")
        f.write("ACTIVE")
        t = io.open(trans_filename, "w", encoding ="utf-8")
        t.write("ACTIVE")
        Labels.config(text = "setting file!")
        CurrentTitle = Label(topFrame, text = "Preparing...")
        CurrentTitle.pack(side = TOP)
        try:
            for i, container in enumerate(containers, start=1):
                Labels.config(text = "jumping to URL!")
                print(container["class"])
                # Featured entries and non-article boxes have a different DOM
                # shape; skip them rather than crash on missing children.
                if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']:
                    print("\n WE'VE CATCHED A BUG!")
                    continue
                if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]:
                    print("\n WARNING! THIS IS NOT AN ARTICLE! ")
                    print(container.div["class"])
                    continue
                progress['value'] = i * 100 / Articles
                local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate')
                local_progress['value'] = 0
                local_progress.pack(side = BOTTOM)
                secondary_URL = container.div.a["href"]
                print("Set target URL!")
                secClient = uReq(secondary_URL)
                news_html = secClient.read()
                secClient.close()
                news_soup = soup(news_html, "html.parser")
                news_containers = news_soup.findAll("div", {"class":"c-entry-hero__header-wrap"})
                if len(news_containers) > 0:
                    news_title = news_containers[0].h1.text
                    CurrentTitle.config(text = news_title)
                    Labels.config(text = "Extracted Title!")
                else:
                    # Bug fix: original fell through here with news_title
                    # unset (NameError) or stale from the previous article.
                    print("ERROR! NO TITLE AT "+secondary_URL)
                    Labels.config(text = "Failed to extract title")
                    continue
                news_body = news_soup.findAll("div", {"class":"c-entry-content"})
                print("\n TITLE: " + news_title)
                f.write("\n \n" + news_title + "\n")
                print("Now translating...")
                t.write("\n \n" + translate(news_title, "ru", "en") + "\n")
                paragraphs = news_body[0].findAll("p")
                print("Title Recorded!")
                local_progress['value'] = 10
                total = len(paragraphs)
                fullText = ""
                fullText2 = ""
                for x, paragraph in enumerate(paragraphs, start=1):
                    local_progress['value'] = x * 100 / total + 10
                    stringx = str(x)
                    Labels.config(text = "Getting paragraph " + stringx + "...")
                    print(paragraph.text + "\n \n \n")
                    # Split the article roughly in half so each translate()
                    # request stays below the service's length limit.
                    if x >= total / 2:
                        fullText2 = fullText2 + paragraph.text.strip()
                    else:
                        fullText = fullText + paragraph.text.strip()
                    Labels.config(text = "Written and Translated Paragraph" + stringx + "!")
                    print("Writing Paragraph " + stringx + "...")
                    if self.needToSkip:
                        break
                if self.needToSkip:
                    self.needToSkip = False
                    continue
                completeText = translate(fullText, "ru", "en") + translate(fullText2, "ru", "en")
                f.write("\n" + fullText + fullText2)
                t.write("\n" + completeText)
                news_picture = news_soup.findAll("picture", {"class":"c-picture"})
                Labels.config(text = "Getting image...")
                article_pic = None
                if news_picture and news_picture[0].img is not None:
                    article_pic = news_picture[0].img.get("src")
                    Labels.config(text = "Picture recieved!")
                else:
                    print("\n THIS ARTICLE HAS NO PICTURE! ")
                    Labels.config(text = "Failed to locate picture :(")
                local_progress['value'] = 120
                # Bug fix: the original wrote article_pic unconditionally,
                # crashing (or reusing a stale URL) when no picture existed.
                if article_pic is not None:
                    f.write("\n PICTURE URL: " + article_pic)
                    t.write("\n PICTURE URL: " + article_pic)
                if self.stop_threads.is_set():
                    print("I SURRENDER!")
                    self.CloseLabel.config(text = "you may close now")
                    break
                print("NOTHING IS STOPPING ME!")
                Labels.config(text = "Finished the article!")
            else:
                Labels.config(text = "All Done!")
        finally:
            # Bug fix: the original leaked both handles if anything raised,
            # and sys.exit()'d from the stop branch before updating state.
            f.close()
            t.close()
            self.stopped = True   # lets waitandkill() return in every case
# --- application wiring: build the window, hook up the scraper ---
texts = "VERGE SCRAPPER"
root = Tk()
program = Scrapers()
mainT = Thread(target=program.start_now)

# Fall back to a hard-coded window label if `texts` were ever undefined.
try:
    texts
except NameError:
    theLabel = Label(root, text = "VERGE SCRAPER")
    print("NO TEXTS!")
else:
    theLabel = Label(root, text = texts)
    print("FOUND TEXTS!")
theLabel.pack()

stop_thread = False
topFrame = Frame(root)
topFrame.pack()
bottomFrame = Frame(root)
bottomFrame.pack(side=BOTTOM)

button1 = Button(topFrame, text = "Start Scrapping!", command = program.start_thread)
button2 = Button(topFrame, text = "Choose Text Location", fg = "black", command = chooseDirectory)
button3 = Button(topFrame, text = "STOP!", fg = "red", command = program.stopTheThread)
# Pack order matters: STOP on top, then Start, then the directory chooser.
for widget in (button3, button1, button2):
    widget.pack(side = TOP)

root.mainloop()
@@ -0,0 +1,229 @@ | |||||
import tkinter | |||||
from tkinter import * | |||||
from os.path import expanduser | |||||
from yandex.Translater import Translater | |||||
from tkinter import filedialog | |||||
from tkinter.ttk import Progressbar | |||||
from urllib.request import urlopen as uReq | |||||
from bs4 import BeautifulSoup as soup | |||||
import io | |||||
import playsound | |||||
import sys | |||||
import time | |||||
from threading import * | |||||
import os | |||||
from tkinter import messagebox | |||||
desktop = expanduser("~/Documents") | |||||
# Yandex Translate client, configured once at import time (en -> ru).
tr = Translater()
# SECURITY: hard-coded API credential committed to source control.  Anyone
# with this file can bill this account; the key should be revoked and loaded
# from an environment variable or config file instead.
tr.set_key('trnsl.1.1.20200525T143704Z.2789c3467e473787.8844abd61fe46dfedeef7f4f4a43082012802ae9')
tr.set_from_lang('en')
tr.set_to_lang('ru')
def chooseDirectory():
    """Prompt the user for an output folder and remember it on the scraper.

    Uses the current working directory as the dialog's starting point and
    stores the selection on the module-level `program` instance.
    """
    start_dir = os.getcwd()
    chosen = filedialog.askdirectory(parent=root, initialdir=start_dir,
                                     title='Please select a directory')
    program.directory = chosen
class Scrapers(object):
    """Scrapes The Verge's tech page, writing each article's English text to
    News.txt and a Yandex-translated Russian copy to TranslatedNews.txt,
    while updating a tkinter UI with progress.

    NOTE(review): relies on module-level globals `root`, `topFrame` and the
    configured Yandex client `tr` existing before any method is called.
    """

    def __init__(self):
        self.thread1 = None            # worker thread, created lazily by start_thread
        self.stop_threads = Event()    # set -> worker should shut down
        self.stopped = False           # worker confirms shutdown through this flag
        self.CloseLabel = Label(root, text = "Finalizing before breaking!")
        self.directory = desktop       # output folder; chooseDirectory() may change it
        self.needToSkip = False        # set -> abandon the article currently being read

    def waitandkill(self):
        """Block until the worker confirms it has stopped.

        Bug fix: the original referenced `self.waitandkill` without calling
        it, so this returned after one second no matter what.  A loop also
        avoids the original's unbounded recursion.
        """
        while not self.stopped:
            time.sleep(1)
        print("DEAD")

    def stopTheThread(self):
        """Ask the worker to stop, then wait for it to finish."""
        print("CALLED ME TOO?")
        self.stop_threads.set()
        self.CloseLabel.pack()
        print("calling wait")
        # Bug fix: original wrote `self.waitandkill` (no parentheses),
        # which never invoked the wait at all.
        self.waitandkill()

    def skip(self):
        """Flag the worker to abandon the article it is currently reading."""
        self.needToSkip = True

    def start_thread(self):
        """Verify both output files are writable, then launch the worker thread."""
        skip_button = Button(topFrame, text = "SKIP!", command = self.skip)
        skip_button.pack(side = BOTTOM)
        # Probe each output file up front so a friendly error appears before
        # any scraping happens.  Bug fix: the original leaked both probe
        # handles (the first was silently overwritten, neither was closed).
        for tail in ("/TranslatedNews.txt", "/News.txt"):
            path = self.directory + tail
            try:
                io.open(path, "w", encoding="utf-8").close()
            except IOError:
                print("FILE ERROR!" + path)
                messagebox.showerror("ACCESS ERROR!", "We can't access "+ path)
                sys.exit()
        if self.thread1 is not None:
            print("NO!")   # a worker already exists; never run two at once
        else:
            self.thread1 = Thread(target = self.start_now)
            self.thread1.start()

    def start_now(self):
        """Worker thread body: scrape every article listed on the Verge tech
        page, write originals and Russian translations, update the UI.

        Honours `self.needToSkip` (drop the current article) and
        `self.stop_threads` (shut down cleanly).
        """
        progress = Progressbar(topFrame, orient = HORIZONTAL, length = 100, mode = 'determinate')
        progress['value'] = 0
        progress.pack(side = TOP)
        Labels = Label(topFrame, text = "SCRAPING")
        Labels.pack(side = TOP)
        main_url = 'https://www.theverge.com/tech'
        uClient = uReq(main_url)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        containers = page_soup.findAll("div",{"class":"c-compact-river__entry"})
        Articles = len(containers)
        filename = self.directory + "/News.txt"
        trans_filename = self.directory + "/TranslatedNews.txt"
        f = io.open(filename, "w", encoding="utf-8")
        f.write("ACTIVE")
        t = io.open(trans_filename, "w", encoding ="utf-8")
        t.write("ACTIVE")
        Labels.config(text = "setting file!")
        CurrentTitle = Label(topFrame, text = "Preparing...")
        CurrentTitle.pack(side = TOP)
        try:
            for i, container in enumerate(containers, start=1):
                Labels.config(text = "jumping to URL!")
                print(container["class"])
                # Featured entries and non-article boxes have a different DOM
                # shape; skip them rather than crash on missing children.
                if container["class"] == ['c-compact-river__entry', 'c-compact-river__entry--featured']:
                    print("\n WE'VE CATCHED A BUG!")
                    continue
                if container.div["class"] != ["c-entry-box--compact", "c-entry-box--compact--article"]:
                    print("\n WARNING! THIS IS NOT AN ARTICLE! ")
                    print(container.div["class"])
                    continue
                progress['value'] = i * 100 / Articles
                local_progress = Progressbar(topFrame, orient = HORIZONTAL, length = 120, mode = 'determinate')
                local_progress['value'] = 0
                local_progress.pack(side = BOTTOM)
                secondary_URL = container.div.a["href"]
                print("Set target URL!")
                secClient = uReq(secondary_URL)
                news_html = secClient.read()
                secClient.close()
                news_soup = soup(news_html, "html.parser")
                news_containers = news_soup.findAll("div", {"class":"c-entry-hero__header-wrap"})
                if len(news_containers) > 0:
                    news_title = news_containers[0].h1.text
                    CurrentTitle.config(text = news_title)
                    Labels.config(text = "Extracted Title!")
                else:
                    # Bug fix: original fell through here with news_title
                    # unset (NameError) or stale from the previous article.
                    print("ERROR! NO TITLE AT "+secondary_URL)
                    Labels.config(text = "Failed to extract title")
                    continue
                news_body = news_soup.findAll("div", {"class":"c-entry-content"})
                print("\n TITLE: " + news_title)
                f.write("\n \n" + news_title + "\n")
                print("Now translating...")
                tr.set_text(news_title)
                t.write("\n \n" + tr.translate() + "\n")
                paragraphs = news_body[0].findAll("p")
                print("Title Recorded!")
                local_progress['value'] = 10
                total = len(paragraphs)
                fullText = ""
                fullText2 = ""
                for x, paragraph in enumerate(paragraphs, start=1):
                    local_progress['value'] = x * 100 / total + 10
                    stringx = str(x)
                    Labels.config(text = "Getting paragraph " + stringx + "...")
                    print(paragraph.text + "\n \n \n")
                    # Split the article roughly in half so each translation
                    # request stays below the service's length limit.
                    if x >= total / 2:
                        fullText2 = fullText2 + paragraph.text.strip()
                    else:
                        fullText = fullText + paragraph.text.strip()
                    Labels.config(text = "Written and Translated Paragraph" + stringx + "!")
                    print("Writing Paragraph " + stringx + "...")
                    if self.needToSkip:
                        break
                if self.needToSkip:
                    self.needToSkip = False
                    continue
                tr.set_text(fullText)
                completeText = tr.translate()
                # Bug fix: the original called set_text(fullText) again here,
                # translating the first half twice and dropping the second.
                tr.set_text(fullText2)
                completeText = completeText + tr.translate()
                f.write("\n" + fullText + fullText2)
                t.write("\n" + completeText)
                news_picture = news_soup.findAll("picture", {"class":"c-picture"})
                Labels.config(text = "Getting image...")
                article_pic = None
                if news_picture and news_picture[0].img is not None:
                    article_pic = news_picture[0].img.get("src")
                    Labels.config(text = "Picture recieved!")
                else:
                    print("\n THIS ARTICLE HAS NO PICTURE! ")
                    Labels.config(text = "Failed to locate picture :(")
                local_progress['value'] = 120
                # Bug fix: the original wrote article_pic unconditionally,
                # crashing (or reusing a stale URL) when no picture existed.
                if article_pic is not None:
                    f.write("\n PICTURE URL: " + article_pic)
                    t.write("\n PICTURE URL: " + article_pic)
                if self.stop_threads.is_set():
                    print("I SURRENDER!")
                    self.CloseLabel.config(text = "you may close now")
                    break
                print("NOTHING IS STOPPING ME!")
                Labels.config(text = "Finished the article!")
            else:
                Labels.config(text = "All Done!")
        finally:
            # Bug fix: the original leaked both handles if anything raised,
            # and sys.exit()'d from the stop branch before updating state.
            f.close()
            t.close()
            self.stopped = True   # lets waitandkill() return in every case
# --- application wiring: build the window, hook up the scraper ---
texts = "VERGE SCRAPPER"
root = Tk()
program = Scrapers()
mainT = Thread(target=program.start_now)

# Fall back to a hard-coded window label if `texts` were ever undefined.
try:
    texts
except NameError:
    theLabel = Label(root, text = "VERGE SCRAPER")
    print("NO TEXTS!")
else:
    theLabel = Label(root, text = texts)
    print("FOUND TEXTS!")
theLabel.pack()

stop_thread = False
topFrame = Frame(root)
topFrame.pack()
bottomFrame = Frame(root)
bottomFrame.pack(side=BOTTOM)

button1 = Button(topFrame, text = "Start Scrapping!", command = program.start_thread)
button2 = Button(topFrame, text = "Choose Text Location", fg = "black", command = chooseDirectory)
button3 = Button(topFrame, text = "STOP!", fg = "red", command = program.stopTheThread)
# Pack order matters: STOP on top, then Start, then the directory chooser.
for widget in (button3, button1, button2):
    widget.pack(side = TOP)

root.mainloop()
@@ -0,0 +1,6 @@ | |||||
# Hi, welcome to Verge Scraper — a tool that scrapes The Verge's website for news and translates it into Russian.
To use the tool, just launch the .pyw executable with Python 3.
**Warning:** `yandex.pyw` uses the Yandex Translation API, while `google.pyw` uses Google Translate.
@@ -0,0 +1,20 @@ | |||||
from distutils.core import setup

# Packaging metadata for the mtranslate console translator.
setup(
    name = 'mtranslate',
    version = '1.6',
    description = 'Google translate console script with easy to use API',
    url = 'https://github.com/mouuff/mtranslate',
    download_url = 'https://github.com/mouuff/mtranslate/tarball/1.6',
    author = 'Arnaud Alies',
    author_email = 'arnaudalies.py@gmail.com',
    packages = ['mtranslate'],
    keywords = ['console', 'translate', 'translator', 'simple', 'google', 'language'],
    classifiers = [],
    # Install an `mtranslate` command that dispatches to the package's main().
    entry_points = {
        'console_scripts': [
            'mtranslate = mtranslate.__main__:main'
        ]
    },
)