🚸 Made output html instead of txt

This commit is contained in:
2021-04-06 18:24:54 +03:00
parent 1d88d0e936
commit 9107c3a0f4
4 changed files with 51 additions and 28 deletions

1
.gitignore vendored
View File

@@ -2,6 +2,7 @@
/.vscode
.DS_Store
__pycache__
/output/*
config.ini
vk_config.v2.json

35
main.py
View File

@@ -1,4 +1,5 @@
import configparser
import argparse
import ast
import sys
from time import sleep
@@ -17,6 +18,19 @@ if config['SEARCH']['interests'] != "all":
else:
interests = "all"
# Command-line interface: a single optional flag that switches the first pass
# over each query from "latest upload only" to a bulk search.
parser = argparse.ArgumentParser(description='Get documents from vk.com')
parser.add_argument(
    '-a', '--all', action='store_true',
    help='Search through all available documents instead of watching only the most recent uploads.',
)
args = parser.parse_args()
def update_status():
    """Redraw the two-line progress display in place.

    Steps the terminal cursor back over the two previously printed status
    lines, clearing each one, and then prints the current values of the
    module-level ``photos_processed`` and ``photos_saved`` counters through
    the console ``c``.
    """
    # "\033[F" moves the cursor to the start of the previous line and
    # "\033[K" erases that line; done once per status line.
    for _ in range(2):
        sys.stdout.write("\033[F")
        sys.stdout.write("\033[K")

    c.print(f'> Documents scanned {photos_processed}')
    c.print(f'> Documents saved {photos_saved}')
c = Console()
c.print('[b]Dochunt[/b] starting...', style='yellow')
vk = authenticate()
@@ -25,7 +39,8 @@ queries = []
for i in range(len(terms)):
queries.append({
'string': terms[i],
'last_url': ''
'last_url': '',
'completed': False
})
photos_processed = 0
@@ -37,13 +52,20 @@ c.print(f'> Documents saved {photos_saved}')
while True:
try:
for query in queries:
if args.all and not query['completed']:
response = vk.docs.search(q=query['string'], count=1000)
else:
response = vk.docs.search(q=query['string'], count=1)
image_url = response.popitem()[1][0]['url'] # WTF not readable
pics_array = response.popitem()[1]
for pic in pics_array:
image_url = pic['url'] # WTF not readable
image_url_clean = image_url.split('?')[0] # Get url without params
# If the image we are getting is new do stuff
if image_url_clean != query['last_url']:
photos_processed += 1
update_status()
query['last_url'] = image_url_clean
text = detect(image_url+query['string'])
@@ -57,15 +79,8 @@ while True:
photos_saved += 1
save_photo(image_url)
sys.stdout.write("\033[F")
sys.stdout.write("\033[K")
sys.stdout.write("\033[F")
sys.stdout.write("\033[K")
c.print(f'> Documents scanned {photos_processed}')
c.print(f'> Documents saved {photos_saved}')
sleep(1)
except KeyboardInterrupt:
c.print('Goodbye!', style='blue')
c.print(' Goodbye!', style='blue')
sys.exit()

View File

@@ -5,5 +5,9 @@ import requests
def detect(url):
    """Download the image at *url* and return its OCR text in lower case.

    Args:
        url: Address of the document/image to fetch.

    Returns:
        The text recognised by Tesseract (``lang='rus'``), lower-cased, or
        an empty string when the download fails or the payload cannot be
        opened as an image.
    """
    # Without a timeout a stalled connection would block the caller forever;
    # a failed download is treated like an unreadable image (no text).
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        return ''

    try:
        img = Image.open(io.BytesIO(response.content))
    except OSError:
        # Payload is not an image Pillow can decode.
        return ''

    return pytesseract.image_to_string(img, lang='rus', timeout=30).lower()

View File

@@ -1,8 +1,11 @@
from rich.console import Console
import calendar;
import time;
c = Console()
ts = calendar.timegm(time.gmtime())
# Record a matching document by appending an <img> tag for `url` to
# output/<ts>.html, where `ts` is the timestamp computed once at import time
# (so every call in one run appends to the same file).
# NOTE(review): this listing is a flattened diff. The two "output.txt" lines
# below look like the pre-change lines that this commit replaces with the HTML
# write; confirm against the repository before treating both writes as live.
# NOTE(review): if both opens are live, only the second handle is closed.
def save_photo(url):
f = open("output.txt", "a", encoding='utf-8')
f.write(url)
f = open(f"output/{ts}.html", "a", encoding='utf-8')
f.write(f'<img width=400 src="{url}"><br>')
f.close()