Browse Source

First commit

master
madiwka3 4 years ago
parent
commit
5f94966776
7 changed files with 68 additions and 627 deletions
  1. +0
    -375
      GOOGLE.pyw
  2. +0
    -229
      YANDEX.pyw
  3. +3
    -0
      dependencies.sh
  4. +26
    -0
      open.py
  5. +35
    -3
      readme.md
  6. +0
    -20
      setup.py
  7. +4
    -0
      spotitube.sh

+ 0
- 375
GOOGLE.pyw View File

@@ -1,375 +0,0 @@
import tkinter
from tkinter import *
from os.path import expanduser
from tkinter import filedialog
from tkinter.ttk import Progressbar
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import io
from mtranslate import translate
import playsound
import sys
import time
from threading import *
import os
from tkinter import messagebox
# Default output folder for News.txt / TranslatedNews.txt. Despite the name,
# this points at the user's Documents directory, not the desktop.
desktop = expanduser("~/Documents")
# Initial caption of the agency toggle button; switchAgencies flips it between
# "verge" and "techradar".
agency = "verge"
def chooseDirectory():
    """Ask the user for an output folder and store it on the scraper.

    The dialog opens in the current working directory.  The selection is
    written to ``program.directory``, which the scraper uses as the folder for
    ``News.txt`` and ``TranslatedNews.txt``.
    """
    currdir = os.getcwd()
    tempdir = filedialog.askdirectory(parent=root, initialdir=currdir, title='Please select a directory')
    # askdirectory returns an empty value when the dialog is cancelled.  Keep
    # the previous folder in that case; otherwise the output paths would
    # become "/News.txt" and "/TranslatedNews.txt".
    if tempdir:
        program.directory = tempdir
def switchAgencies(agencies):
    """Flip the agency toggle button to the other news source.

    ``agencies`` is the caption currently shown on the button: "verge"
    switches to "techradar", anything else switches to "verge".
    """
    print("called Agencies")
    target = "techradar" if agencies == "verge" else "verge"
    print("switching to " + target)
    button4['text'] = target
class Scrapers(object):
    """Scrape a news site, translate each article and save both versions.

    The window's buttons call ``start_thread``, ``stopTheThread`` and ``skip``;
    ``start_now`` does the scraping and is run on a worker thread by
    ``start_thread``.  The original text is written to
    ``<directory>/News.txt`` and the translation to
    ``<directory>/TranslatedNews.txt``.

    Relies on module-level names defined by the surrounding script: ``root``,
    ``topFrame``, ``button4`` and ``desktop``.

    NOTE(review): widgets are updated directly from the worker thread, exactly
    as before; confirm this is acceptable for the Tk build in use.
    """

    # Class lists used to recognise listing entries on theverge.com/tech.
    _VERGE_FEATURED = ['c-compact-river__entry', 'c-compact-river__entry--featured']
    _VERGE_ARTICLE_BOX = ["c-entry-box--compact", "c-entry-box--compact--article"]

    def __init__(self):
        self.thread1 = None            # worker thread running start_now, once started
        self.stop_threads = Event()    # set by stopTheThread to request a stop
        self.stopped = False           # set by the worker once it has honoured the stop
        self.CloseLabel = Label(root, text="Finalizing before breaking!")
        self.directory = desktop       # folder that receives the two output files
        self.needToSkip = False        # set by skip() to abandon the current article

    def waitandkill(self, poll_interval=1):
        """Block until the worker reports that it has stopped, then log it.

        Args:
            poll_interval: Seconds to sleep between checks of ``self.stopped``.

        Returns early (without logging) when there is no live worker left to
        wait for, so the caller can never be blocked forever.
        """
        while not self.stopped:
            worker = self.thread1
            if worker is None or not worker.is_alive():
                return
            time.sleep(poll_interval)
        print("DEAD")

    def stopTheThread(self):
        """Ask the worker to stop after the article it is processing."""
        print("CALLED ME TOO?")
        self.stop_threads.set()
        self.CloseLabel.pack()
        print("calling wait")
        # The original only referenced ``self.waitandkill`` without calling
        # it.  Run it on a daemon thread so the Tk main loop is not blocked.
        Thread(target=self.waitandkill, daemon=True).start()

    def skip(self):
        """Request that the article currently being read is abandoned."""
        self.needToSkip = True

    def start_thread(self):
        """Verify the output files are writable and start the scraping worker.

        Only one worker is ever started per instance; later calls print
        ``NO!`` and do nothing.  If an output file cannot be opened an error
        box is shown and ``sys.exit()`` is called, as before.
        """
        # Check for an existing worker first: opening the output files in "w"
        # mode truncates them, which must not happen while a worker writes.
        if self.thread1 is not None:
            print("NO!")
            return
        Skip = Button(topFrame, text="SKIP!", command=self.skip)
        Skip.pack(side=BOTTOM)
        for name in ("/TranslatedNews.txt", "/News.txt"):
            path = self.directory + name
            try:
                # Opened only to prove access; the handle is closed right away.
                with io.open(path, "w", encoding="utf-8"):
                    pass
            except IOError:
                print("FILE ERROR!" + path)
                messagebox.showerror("ACCESS ERROR!", "We can't access " + path)
                sys.exit()
        self.thread1 = Thread(target=self.start_now)
        self.thread1.start()

    def start_now(self):
        """Scrape the agency currently shown on the toggle button.

        Intended to run on the worker thread created by ``start_thread``.
        """
        print("Getting" + button4['text'])
        if button4['text'] == "techradar":
            self._scrape(
                main_url='https://www.techradar.com/news',
                listing_class="listingResult",
                article_url=self._techradar_article_url,
                title_query=("header",),
                body_attrs={"id": "article-body"},
                picture_url=self._techradar_picture_url,
                is_not_article=self._techradar_is_not_article,
            )
        else:
            self._scrape(
                main_url='https://www.theverge.com/tech',
                listing_class="c-compact-river__entry",
                article_url=self._verge_article_url,
                title_query=("div", {"class": "c-entry-hero__header-wrap"}),
                body_attrs={"class": "c-entry-content"},
                picture_url=self._verge_picture_url,
            )

    @staticmethod
    def _fetch_soup(url):
        """Download *url* and parse it with BeautifulSoup's ``html.parser``."""
        with uReq(url) as client:
            html = client.read()
        return soup(html, "html.parser")

    @staticmethod
    def _split_in_halves(pieces):
        """Join paragraph texts into two strings that are translated separately.

        A paragraph at 1-based position ``x`` of ``y`` goes to the second
        string when ``x >= y / 2`` and to the first otherwise.
        """
        total = len(pieces)
        first = "".join(p for x, p in enumerate(pieces, start=1) if x < total / 2)
        second = "".join(p for x, p in enumerate(pieces, start=1) if x >= total / 2)
        return first, second

    @staticmethod
    def _techradar_article_url(container):
        """Return the link of a TechRadar listing entry, or None to skip it."""
        if 'sponsored-post' in container["class"]:
            print("\n WE'VE CATCHED AN AD!")
            return None
        link = container.a
        if link is None:
            return None
        return link.get("href")

    @staticmethod
    def _techradar_is_not_article(news_soup):
        """Return True for TechRadar pages carrying the ``icon-plus_circle`` marker."""
        squash = news_soup.findAll("div", {"class": "icon-plus_circle"})
        print(len(squash))
        if squash:
            print("\n WARNING! THIS IS NOT AN ARTICLE! ")
            return True
        return False

    @staticmethod
    def _techradar_picture_url(news_soup):
        """Return the hero image URL of a TechRadar article, or None."""
        sources = news_soup.findAll("source", {"class": "hero-image"})
        if sources:
            return sources[0].get("data-original-mos")
        return None

    @classmethod
    def _verge_article_url(cls, container):
        """Return the link of a Verge listing entry, or None to skip it."""
        if container["class"] == cls._VERGE_FEATURED:
            print("\n WE'VE CATCHED A BUG!")
            return None
        box = container.div
        if box is None or box.get("class") != cls._VERGE_ARTICLE_BOX:
            print("\n WARNING! THIS IS NOT AN ARTICLE! ")
            if box is not None:
                print(box.get("class"))
            return None
        link = box.a
        if link is None:
            return None
        return link.get("href")

    @staticmethod
    def _verge_picture_url(news_soup):
        """Return the lead picture URL of a Verge article, or None."""
        pictures = news_soup.findAll("picture", {"class": "c-picture"})
        if pictures and pictures[0].img is not None:
            return pictures[0].img.get("src")
        return None

    def _scrape(self, main_url, listing_class, article_url, title_query,
                body_attrs, picture_url, is_not_article=None):
        """Walk one site's listing page and process every article on it.

        Args:
            main_url: Listing page to download.
            listing_class: CSS class of the ``div`` wrapping each entry.
            article_url: Callable mapping an entry to its URL, or None to skip.
            title_query: Positional arguments for ``findAll`` locating the
                element whose ``h1`` holds the title.
            body_attrs: Attribute filter of the ``div`` holding the text.
            picture_url: Callable mapping the article soup to an image URL or
                None.
            is_not_article: Optional callable; True rejects a downloaded page.
        """
        progress = Progressbar(topFrame, orient=HORIZONTAL, length=100, mode='determinate')
        progress['value'] = 0
        progress.pack(side=TOP)
        status = Label(topFrame, text="SCRAPING")
        status.pack(side=TOP)
        containers = self._fetch_soup(main_url).findAll("div", {"class": listing_class})
        total = len(containers)
        print(total)
        filename = self.directory + "/News.txt"
        trans_filename = self.directory + "/TranslatedNews.txt"
        stop_requested = False
        # ``with`` closes both files even if an article raises unexpectedly.
        with io.open(filename, "w", encoding="utf-8") as f, \
                io.open(trans_filename, "w", encoding="utf-8") as t:
            f.write("ACTIVE")
            t.write("ACTIVE")
            status.config(text="setting file!")
            current_title = Label(topFrame, text="Preparing...")
            current_title.pack(side=TOP)
            # One per-article bar is reused instead of stacking a new widget
            # in the window for every article.
            local_progress = Progressbar(topFrame, orient=HORIZONTAL, length=120, mode='determinate')
            local_progress.pack(side=BOTTOM)
            for index, container in enumerate(containers, start=1):
                status.config(text="jumping to URL!")
                print(container["class"])
                finished = False
                url = article_url(container)
                if url:
                    progress['value'] = index * 100 / total
                    local_progress['value'] = 0
                    finished = self._scrape_article(
                        url, f, t, status, current_title, local_progress,
                        title_query, body_attrs, picture_url, is_not_article)
                # Checked after every entry, including skipped ones, so a stop
                # request is never delayed by a run of rejected entries.
                if self.stop_threads.is_set():
                    stop_requested = True
                    break
                if finished:
                    print("NOTHING IS STOPPING ME!")
                    status.config(text="Finished the article!")
        if stop_requested:
            print("I SURRENDER!")
            self.stopped = True
            self.CloseLabel.config(text="you may close now")
        else:
            status.config(text="All Done!")

    def _scrape_article(self, url, f, t, status, current_title, local_progress,
                        title_query, body_attrs, picture_url, is_not_article):
        """Download one article and append it to both output files.

        Returns True when the article was written completely, False when it
        was skipped (download failure, no title or body, rejected page, or
        the user pressed SKIP).
        """
        print("Set target URL!" + url)
        try:
            news_soup = self._fetch_soup(url)
        except OSError as error:
            # urllib's URLError derives from OSError; one unreachable article
            # should not abort the whole run.
            print("ERROR! COULD NOT LOAD " + url + ": " + str(error))
            status.config(text="Failed to load article")
            return False
        if is_not_article is not None and is_not_article(news_soup):
            return False
        headers = news_soup.findAll(*title_query)
        if not headers or headers[0].h1 is None:
            # Previously execution continued with an unbound or stale title.
            print("ERROR! NO TITLE AT " + url)
            status.config(text="Failed to extract title")
            return False
        news_title = headers[0].h1.text
        current_title.config(text=news_title)
        status.config(text="Extracted Title!")
        news_body = news_soup.findAll("div", body_attrs)
        if not news_body:
            print("ERROR! NO BODY AT " + url)
            status.config(text="Failed to extract article text")
            return False
        print("\n TITLE: " + news_title)
        f.write("\n \n" + news_title + "\n")
        print("Now translating...")
        t.write("\n \n" + translate(news_title, "ru", "en") + "\n")
        paragraphs = news_body[0].findAll("p")
        print("Title Recorded!")
        local_progress['value'] = 10
        count = len(paragraphs)
        pieces = []
        for number, paragraph in enumerate(paragraphs, start=1):
            local_progress['value'] = number * 100 / count + 10
            status.config(text="Getting paragraph " + str(number) + "...")
            print(paragraph.text + "\n \n \n")
            pieces.append(paragraph.text.strip())
            if self.needToSkip:
                break
        if self.needToSkip:
            self.needToSkip = False
            return False
        first_half, second_half = self._split_in_halves(pieces)
        # The text is translated in two requests, as before; empty halves are
        # not sent at all.
        translated = "".join(
            translate(half, "ru", "en") for half in (first_half, second_half) if half)
        f.write("\n" + first_half + second_half)
        t.write("\n" + translated)
        status.config(text="Getting image...")
        article_pic = picture_url(news_soup)
        if article_pic:
            status.config(text="Picture recieved!")
            f.write("\n PICTURE URL: " + article_pic)
            t.write("\n PICTURE URL: " + article_pic)
        else:
            # Previously this path wrote an unbound or stale picture URL.
            print("\n THIS ARTICLE HAS NO PICTURE! ")
            status.config(text="Failed to locate picture :(")
        local_progress['value'] = 120
        return True
# Window construction and Tk main loop.
texts = "VERGE SCRAPPER"
root = Tk()
# Scrapers.__init__ creates a Label parented to ``root``, so the window must
# exist before the scraper is instantiated.
program = Scrapers()
# ``texts`` is assigned just above, so the former ``except NameError`` fallback
# could never run.  It was removed together with a Thread object that was never
# started and a ``stop_thread`` flag that nothing read.
theLabel = Label(root, text=texts)
theLabel.pack()
topFrame = Frame(root)
topFrame.pack()
bottomFrame = Frame(root)
bottomFrame.pack(side=BOTTOM)
button1 = Button(topFrame, text="Start Scrapping!", command=program.start_thread)
# The toggle passes its own current caption so switchAgencies can flip it.
button4 = Button(topFrame, text=agency, command=lambda: switchAgencies(button4['text']))
button2 = Button(topFrame, text="Choose Text Location", fg="black", command=chooseDirectory)
button3 = Button(topFrame, text="STOP!", fg="red", command=program.stopTheThread)
# Pack order decides the top-to-bottom layout: STOP, Start, agency, location.
for widget in (button3, button1, button4, button2):
    widget.pack(side=TOP)
root.mainloop()

+ 0
- 229
YANDEX.pyw View File

@@ -1,229 +0,0 @@
import tkinter
from tkinter import *
from os.path import expanduser
from yandex.Translater import Translater
from tkinter import filedialog
from tkinter.ttk import Progressbar
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import io
import playsound
import sys
import time
from threading import *
import os
from tkinter import messagebox
# Default output folder for News.txt / TranslatedNews.txt. Despite the name,
# this points at the user's Documents directory, not the desktop.
desktop = expanduser("~/Documents")
# Shared Yandex translator, configured for English -> Russian.
tr = Translater()
# SECURITY: an API key is committed to source control here and should be
# treated as leaked (revoke/rotate it).  A key can now be supplied through the
# YANDEX_TRANSLATE_KEY environment variable; the committed value is only kept
# as a fallback so existing setups keep working.
tr.set_key(os.environ.get(
    'YANDEX_TRANSLATE_KEY',
    'trnsl.1.1.20200525T143704Z.2789c3467e473787.8844abd61fe46dfedeef7f4f4a43082012802ae9'))
tr.set_from_lang('en')
tr.set_to_lang('ru')
def chooseDirectory():
    """Ask the user for an output folder and store it on the scraper.

    The dialog opens in the current working directory.  The selection is
    written to ``program.directory``, which the scraper uses as the folder for
    ``News.txt`` and ``TranslatedNews.txt``.
    """
    currdir = os.getcwd()
    tempdir = filedialog.askdirectory(parent=root, initialdir=currdir, title='Please select a directory')
    # askdirectory returns an empty value when the dialog is cancelled.  Keep
    # the previous folder in that case; otherwise the output paths would
    # become "/News.txt" and "/TranslatedNews.txt".
    if tempdir:
        program.directory = tempdir
class Scrapers(object):
    """Scrape The Verge's tech listing, translate each article and save both.

    The window's buttons call ``start_thread``, ``stopTheThread`` and ``skip``;
    ``start_now`` does the scraping and is run on a worker thread by
    ``start_thread``.  The original text is written to
    ``<directory>/News.txt`` and the translation to
    ``<directory>/TranslatedNews.txt``.

    Relies on module-level names defined by the surrounding script: ``root``,
    ``topFrame``, ``desktop`` and the configured translator ``tr``.

    NOTE(review): widgets are updated directly from the worker thread, exactly
    as before; confirm this is acceptable for the Tk build in use.
    """

    # Class lists used to recognise listing entries on theverge.com/tech.
    _FEATURED_ENTRY = ['c-compact-river__entry', 'c-compact-river__entry--featured']
    _ARTICLE_BOX = ["c-entry-box--compact", "c-entry-box--compact--article"]

    def __init__(self):
        self.thread1 = None            # worker thread running start_now, once started
        self.stop_threads = Event()    # set by stopTheThread to request a stop
        self.stopped = False           # set by the worker once it has honoured the stop
        self.CloseLabel = Label(root, text="Finalizing before breaking!")
        self.directory = desktop       # folder that receives the two output files
        self.needToSkip = False        # set by skip() to abandon the current article

    def waitandkill(self, poll_interval=1):
        """Block until the worker reports that it has stopped, then log it.

        Args:
            poll_interval: Seconds to sleep between checks of ``self.stopped``.

        Returns early (without logging) when there is no live worker left to
        wait for, so the caller can never be blocked forever.
        """
        while not self.stopped:
            worker = self.thread1
            if worker is None or not worker.is_alive():
                return
            time.sleep(poll_interval)
        print("DEAD")

    def stopTheThread(self):
        """Ask the worker to stop after the article it is processing."""
        print("CALLED ME TOO?")
        self.stop_threads.set()
        self.CloseLabel.pack()
        print("calling wait")
        # The original only referenced ``self.waitandkill`` without calling
        # it.  Run it on a daemon thread so the Tk main loop is not blocked.
        Thread(target=self.waitandkill, daemon=True).start()

    def skip(self):
        """Request that the article currently being read is abandoned."""
        self.needToSkip = True

    def start_thread(self):
        """Verify the output files are writable and start the scraping worker.

        Only one worker is ever started per instance; later calls print
        ``NO!`` and do nothing.  If an output file cannot be opened an error
        box is shown and ``sys.exit()`` is called, as before.
        """
        # Check for an existing worker first: opening the output files in "w"
        # mode truncates them, which must not happen while a worker writes.
        if self.thread1 is not None:
            print("NO!")
            return
        Skip = Button(topFrame, text="SKIP!", command=self.skip)
        Skip.pack(side=BOTTOM)
        for name in ("/TranslatedNews.txt", "/News.txt"):
            path = self.directory + name
            try:
                # Opened only to prove access; the handle is closed right away.
                with io.open(path, "w", encoding="utf-8"):
                    pass
            except IOError:
                print("FILE ERROR!" + path)
                messagebox.showerror("ACCESS ERROR!", "We can't access " + path)
                sys.exit()
        self.thread1 = Thread(target=self.start_now)
        self.thread1.start()

    def start_now(self):
        """Scrape the listing page and process every article on it.

        Intended to run on the worker thread created by ``start_thread``.
        """
        progress = Progressbar(topFrame, orient=HORIZONTAL, length=100, mode='determinate')
        progress['value'] = 0
        progress.pack(side=TOP)
        status = Label(topFrame, text="SCRAPING")
        status.pack(side=TOP)
        listing = self._fetch_soup('https://www.theverge.com/tech')
        containers = listing.findAll("div", {"class": "c-compact-river__entry"})
        total = len(containers)
        filename = self.directory + "/News.txt"
        trans_filename = self.directory + "/TranslatedNews.txt"
        stop_requested = False
        # ``with`` closes both files even if an article raises unexpectedly.
        with io.open(filename, "w", encoding="utf-8") as f, \
                io.open(trans_filename, "w", encoding="utf-8") as t:
            f.write("ACTIVE")
            t.write("ACTIVE")
            status.config(text="setting file!")
            current_title = Label(topFrame, text="Preparing...")
            current_title.pack(side=TOP)
            # One per-article bar is reused instead of stacking a new widget
            # in the window for every article.
            local_progress = Progressbar(topFrame, orient=HORIZONTAL, length=120, mode='determinate')
            local_progress.pack(side=BOTTOM)
            for index, container in enumerate(containers, start=1):
                status.config(text="jumping to URL!")
                print(container["class"])
                finished = False
                url = self._article_url(container)
                if url:
                    progress['value'] = index * 100 / total
                    local_progress['value'] = 0
                    finished = self._scrape_article(
                        url, f, t, status, current_title, local_progress)
                # Checked after every entry, including skipped ones, so a stop
                # request is never delayed by a run of rejected entries.
                if self.stop_threads.is_set():
                    stop_requested = True
                    break
                if finished:
                    print("NOTHING IS STOPPING ME!")
                    status.config(text="Finished the article!")
        if stop_requested:
            print("I SURRENDER!")
            self.stopped = True
            self.CloseLabel.config(text="you may close now")
        else:
            status.config(text="All Done!")

    @staticmethod
    def _fetch_soup(url):
        """Download *url* and parse it with BeautifulSoup's ``html.parser``."""
        with uReq(url) as client:
            html = client.read()
        return soup(html, "html.parser")

    @staticmethod
    def _translate(text):
        """Translate *text* with the module-level translator ``tr``."""
        tr.set_text(text)
        return tr.translate()

    @staticmethod
    def _split_in_halves(pieces):
        """Join paragraph texts into two strings that are translated separately.

        A paragraph at 1-based position ``x`` of ``y`` goes to the second
        string when ``x >= y / 2`` and to the first otherwise.
        """
        total = len(pieces)
        first = "".join(p for x, p in enumerate(pieces, start=1) if x < total / 2)
        second = "".join(p for x, p in enumerate(pieces, start=1) if x >= total / 2)
        return first, second

    @classmethod
    def _article_url(cls, container):
        """Return the link of a listing entry, or None when it must be skipped."""
        if container["class"] == cls._FEATURED_ENTRY:
            print("\n WE'VE CATCHED A BUG!")
            return None
        box = container.div
        if box is None or box.get("class") != cls._ARTICLE_BOX:
            print("\n WARNING! THIS IS NOT AN ARTICLE! ")
            if box is not None:
                print(box.get("class"))
            return None
        link = box.a
        if link is None:
            return None
        return link.get("href")

    @staticmethod
    def _picture_url(news_soup):
        """Return the lead picture URL of an article, or None."""
        pictures = news_soup.findAll("picture", {"class": "c-picture"})
        if pictures and pictures[0].img is not None:
            return pictures[0].img.get("src")
        return None

    def _scrape_article(self, url, f, t, status, current_title, local_progress):
        """Download one article and append it to both output files.

        Returns True when the article was written completely, False when it
        was skipped (download failure, no title or body, or the user pressed
        SKIP).
        """
        print("Set target URL!")
        try:
            news_soup = self._fetch_soup(url)
        except OSError as error:
            # urllib's URLError derives from OSError; one unreachable article
            # should not abort the whole run.
            print("ERROR! COULD NOT LOAD " + url + ": " + str(error))
            status.config(text="Failed to load article")
            return False
        headers = news_soup.findAll("div", {"class": "c-entry-hero__header-wrap"})
        if not headers or headers[0].h1 is None:
            # Previously execution continued with an unbound or stale title.
            print("ERROR! NO TITLE AT " + url)
            status.config(text="Failed to extract title")
            return False
        news_title = headers[0].h1.text
        current_title.config(text=news_title)
        status.config(text="Extracted Title!")
        news_body = news_soup.findAll("div", {"class": "c-entry-content"})
        if not news_body:
            print("ERROR! NO BODY AT " + url)
            status.config(text="Failed to extract article text")
            return False
        print("\n TITLE: " + news_title)
        f.write("\n \n" + news_title + "\n")
        print("Now translating...")
        t.write("\n \n" + self._translate(news_title) + "\n")
        paragraphs = news_body[0].findAll("p")
        print("Title Recorded!")
        local_progress['value'] = 10
        count = len(paragraphs)
        pieces = []
        for number, paragraph in enumerate(paragraphs, start=1):
            local_progress['value'] = number * 100 / count + 10
            status.config(text="Getting paragraph " + str(number) + "...")
            print(paragraph.text + "\n \n \n")
            pieces.append(paragraph.text.strip())
            if self.needToSkip:
                break
        if self.needToSkip:
            self.needToSkip = False
            return False
        first_half, second_half = self._split_in_halves(pieces)
        # Bug fix: the first half used to be translated twice, so the second
        # half of every article never reached the translated file.  Empty
        # halves are not sent to the translator at all.
        translated = "".join(
            self._translate(half) for half in (first_half, second_half) if half)
        f.write("\n" + first_half + second_half)
        t.write("\n" + translated)
        status.config(text="Getting image...")
        article_pic = self._picture_url(news_soup)
        if article_pic:
            status.config(text="Picture recieved!")
            f.write("\n PICTURE URL: " + article_pic)
            t.write("\n PICTURE URL: " + article_pic)
        else:
            # Previously this path raised IndexError or wrote a stale URL.
            print("\n THIS ARTICLE HAS NO PICTURE! ")
            status.config(text="Failed to locate picture :(")
        local_progress['value'] = 120
        return True
# Window construction and Tk main loop.
texts = "VERGE SCRAPPER"
root = Tk()
# Scrapers.__init__ creates a Label parented to ``root``, so the window must
# exist before the scraper is instantiated.
program = Scrapers()
# ``texts`` is assigned just above, so the former ``except NameError`` fallback
# could never run.  It was removed together with a Thread object that was never
# started and a ``stop_thread`` flag that nothing read.
theLabel = Label(root, text=texts)
theLabel.pack()
topFrame = Frame(root)
topFrame.pack()
bottomFrame = Frame(root)
bottomFrame.pack(side=BOTTOM)
button1 = Button(topFrame, text="Start Scrapping!", command=program.start_thread)
button2 = Button(topFrame, text="Choose Text Location", fg="black", command=chooseDirectory)
button3 = Button(topFrame, text="STOP!", fg="red", command=program.stopTheThread)
# Pack order decides the top-to-bottom layout: STOP, Start, location.
for widget in (button3, button1, button2):
    widget.pack(side=TOP)
root.mainloop()

+ 3
- 0
dependencies.sh View File

@@ -0,0 +1,3 @@
#!/bin/sh
# Install everything Spotitube needs on Arch Linux and make the launcher
# executable.
set -e  # stop at the first failing step instead of carrying on half-installed

# mpv is required by spotitube.sh for playback; --needed skips packages that
# are already installed and up to date.
sudo pacman -S --needed python3 python-pip mpv
# Use the pip that belongs to the python3 interpreter the launcher runs.
python3 -m pip install youtube-search
chmod +x spotitube.sh

+ 26
- 0
open.py View File

@@ -0,0 +1,26 @@
#!/usr/bin/python3
from youtube_search import YoutubeSearch
import sys, getopt

def _watch_url(url_suffix):
    """Turn a search result's ``url_suffix`` into a playable link.

    ``/watch?v=<id>`` suffixes become ``https://youtu.be/<id>`` with any extra
    query parameters dropped; any other suffix is appended to the YouTube
    host unchanged.
    """
    from urllib.parse import parse_qs, urlparse

    video_ids = parse_qs(urlparse(url_suffix).query).get('v')
    if video_ids:
        return 'https://youtu.be/' + video_ids[0]
    return 'https://www.youtube.com' + url_suffix


def main(argv):
    """Print a link to the first YouTube hit for the given search keyword.

    Args:
        argv: Command-line arguments without the program name.  Expects
            ``-i <Search Keyword>`` (long form ``--ifile``).  ``-o``/``--ofile``
            is still accepted for compatibility but has no effect.

    Exits with status 2 on bad usage or a missing keyword and with status 1
    when the search returns nothing.  Only the link is written to stdout,
    because spotitube.sh captures stdout as the URL to play; diagnostics go
    to stderr.
    """
    usage = 'open.py -i <Search Keyword>'
    keyword = ''
    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(usage, file=sys.stderr)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            keyword = arg
    if not keyword.strip():
        # Previously an empty keyword was sent to YouTube as a search.
        print(usage, file=sys.stderr)
        sys.exit(2)
    results = YoutubeSearch(keyword, max_results=1).to_dict()
    if not results:
        # Previously results[0] raised IndexError here.
        print('No YouTube results for: ' + keyword, file=sys.stderr)
        sys.exit(1)
    print(_watch_url(results[0]['url_suffix']))


if __name__ == "__main__":
    main(sys.argv[1:])

+ 35
- 3
readme.md View File

@@ -1,6 +1,38 @@
# Hi, welcome to Verge Scrapper, a tool for scraping The Verge's website for news and translating it into Russian.
# Spotitube

Basically, a tool to listen to music using YouTube search results.
# Installation
Arch Linux users only (skip this step on any other OS):
```shell
chmod +x dependencies.sh
./dependencies.sh
```
This will install the required dependencies (skip next step)

To use the tool, just launch the .pyw executable with python3.
# Prerequisites

<b>Warning! YANDEX.PYW USES YANDEX TRANSLATION API, WHILE GOOGLE.PYW USES GOOGLE TRANSLATE</b>
The dependencies.sh already handles the dependencies, but if you want to do it manually, then here you go.


This app requires python3, pip3, and the youtube-search Python module to function properly, plus the mpv player, which spotitube.sh uses for playback:

Arch:
```shell
sudo pacman -S python3 python-pip
pip install youtube-search
```
Debian/ubuntu:
```shell
sudo apt install python3 python3-pip
pip3 install youtube-search
```

# Usage

All you have to do is launch spotitube.sh and give it a string argument with the song name!

```shell
./spotitube.sh "Song name"
```

Enjoy your music!

+ 0
- 20
setup.py View File

@@ -1,20 +0,0 @@

# setuptools understands ``entry_points``; plain distutils ignores that option
# (so the ``mtranslate`` console script was never installed) and is no longer
# part of the standard library from Python 3.12 on.
try:
    from setuptools import setup
except ImportError:  # old environments without setuptools
    from distutils.core import setup

setup(
    name='mtranslate',
    packages=['mtranslate'],
    version='1.6',
    description='Google translate console script with easy to use API',
    author='Arnaud Alies',
    author_email='arnaudalies.py@gmail.com',
    url='https://github.com/mouuff/mtranslate',
    download_url='https://github.com/mouuff/mtranslate/tarball/1.6',
    keywords=['console', 'translate', 'translator', 'simple', 'google', 'language'],
    classifiers=[],
    # Installs an ``mtranslate`` command that runs mtranslate.__main__:main.
    entry_points={
        'console_scripts': [
            'mtranslate = mtranslate.__main__:main',
        ],
    },
)

+ 4
- 0
spotitube.sh View File

@@ -0,0 +1,4 @@
#!/bin/sh
# Usage: ./spotitube.sh "Song name"
# Looks the song up on YouTube via open.py and plays its audio with mpv.

if [ -z "$1" ]; then
    echo "Usage: $0 \"Song name\"" >&2
    exit 1
fi

# Resolve open.py next to this script so the launcher works from any directory.
script_dir=$(dirname "$0")

# Stop if the lookup fails or prints nothing, instead of starting mpv without
# a URL.
va=$(python3 "$script_dir/open.py" -i "$1") || exit 1
if [ -z "$va" ]; then
    echo "No result for: $1" >&2
    exit 1
fi

# Quoted so song names with spaces or glob characters are passed through intact.
echo "$1"
echo "Opening $va"
mpv --no-video "$va"

Loading…
Cancel
Save