diff --git a/.gitignore b/.gitignore index a5c7feb..2d58e29 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ /.vscode .DS_Store __pycache__ +/output/* config.ini vk_config.v2.json \ No newline at end of file diff --git a/main.py b/main.py index 174cb19..3c2ebfc 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,5 @@ import configparser +import argparse import ast import sys from time import sleep @@ -17,6 +18,19 @@ if config['SEARCH']['interests'] != "all": else: interests = "all" +parser = argparse.ArgumentParser(description='Get documents from vk.com') +parser.add_argument('-a', '--all', action='store_true', + help='Search through all availible documents instead of watching only the most recent uploads.') +args = parser.parse_args() + +def update_status(): + sys.stdout.write("\033[F") + sys.stdout.write("\033[K") + sys.stdout.write("\033[F") + sys.stdout.write("\033[K") + c.print(f'> Documents scanned {photos_processed}') + c.print(f'> Documents saved {photos_saved}') + c = Console() c.print('[b]Dochunt[/b] starting...', style='yellow') vk = authenticate() @@ -25,7 +39,8 @@ queries = [] for i in range(len(terms)): queries.append({ 'string': terms[i], - 'last_url': '' + 'last_url': '', + 'completed': False }) photos_processed = 0 @@ -37,35 +52,35 @@ c.print(f'> Documents saved {photos_saved}') while True: try: for query in queries: - response = vk.docs.search(q=query['string'], count=1) - image_url = response.popitem()[1][0]['url'] # WTF not readable - image_url_clean = image_url.split('?')[0] # Get url without params + if args.all and not query['completed']: + response = vk.docs.search(q=query['string'], count=1000) + else: + response = vk.docs.search(q=query['string'], count=1) + + pics_array = response.popitem()[1] + for pic in pics_array: + image_url = pic['url'] # WTF not readable + image_url_clean = image_url.split('?')[0] # Get url without params - # If the image we are getting is new do stuff - if image_url_clean != query['last_url']: - photos_processed += 1 - query['last_url'] = image_url_clean + # If the image we are getting is new do stuff + if image_url_clean != query['last_url']: + photos_processed += 1 + update_status() + query['last_url'] = image_url_clean - text = detect(image_url+query['string']) - if interests == "all": - if not text.isspace(): - photos_saved += 1 - save_photo(image_url) - else: - for interest in interests: - if interest in text: + text = detect(image_url+query['string']) + if interests == "all": + if not text.isspace(): photos_saved += 1 save_photo(image_url) - - sys.stdout.write("\033[F") - sys.stdout.write("\033[K") - sys.stdout.write("\033[F") - sys.stdout.write("\033[K") - c.print(f'> Documents scanned {photos_processed}') - c.print(f'> Documents saved {photos_saved}') + else: + for interest in interests: + if interest in text: + photos_saved += 1 + save_photo(image_url) sleep(1) except KeyboardInterrupt: - c.print('Goodbye!', style='blue') - sys.exit() \ No newline at end of file + c.print(' Goodbye!', style='blue') + sys.exit() diff --git a/utils/detect.py b/utils/detect.py index 81486e0..e501c64 100644 --- a/utils/detect.py +++ b/utils/detect.py @@ -5,5 +5,9 @@ import requests def detect(url): response = requests.get(url) - img = Image.open(io.BytesIO(response.content)) + try: + img = Image.open(io.BytesIO(response.content)) + except OSError: + return '' + return pytesseract.image_to_string(img, lang='rus', timeout=30).lower() diff --git a/utils/savePhoto.py b/utils/savePhoto.py index 3ce0022..8d7031e 100644 --- a/utils/savePhoto.py +++ b/utils/savePhoto.py @@ -1,8 +1,11 @@ from rich.console import Console +import calendar; +import time; c = Console() +ts = calendar.timegm(time.gmtime()) def save_photo(url): - f = open("output.txt", "a", encoding='utf-8') - f.write(url) + f = open(f"output/{ts}.html", "a", encoding='utf-8') + f.write(f'
') f.close() \ No newline at end of file