diff --git a/.gitignore b/.gitignore
index a5c7feb..2d58e29 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
/.vscode
.DS_Store
__pycache__
+/output/*
config.ini
vk_config.v2.json
\ No newline at end of file
diff --git a/main.py b/main.py
index 174cb19..3c2ebfc 100644
--- a/main.py
+++ b/main.py
@@ -1,4 +1,5 @@
import configparser
+import argparse
import ast
import sys
from time import sleep
@@ -17,6 +18,19 @@ if config['SEARCH']['interests'] != "all":
else:
interests = "all"
+parser = argparse.ArgumentParser(description='Get documents from vk.com')
+parser.add_argument('-a', '--all', action='store_true',
+ help='Search through all available documents instead of watching only the most recent uploads.')
+args = parser.parse_args()
+
+def update_status():
+ sys.stdout.write("\033[F")
+ sys.stdout.write("\033[K")
+ sys.stdout.write("\033[F")
+ sys.stdout.write("\033[K")
+ c.print(f'> Documents scanned {photos_processed}')
+ c.print(f'> Documents saved {photos_saved}')
+
c = Console()
c.print('[b]Dochunt[/b] starting...', style='yellow')
vk = authenticate()
@@ -25,7 +39,8 @@ queries = []
for i in range(len(terms)):
queries.append({
'string': terms[i],
- 'last_url': ''
+ 'last_url': '',
+ 'completed': False
})
photos_processed = 0
@@ -37,35 +52,35 @@ c.print(f'> Documents saved {photos_saved}')
while True:
try:
for query in queries:
- response = vk.docs.search(q=query['string'], count=1)
- image_url = response.popitem()[1][0]['url'] # WTF not readable
- image_url_clean = image_url.split('?')[0] # Get url without params
+ if args.all and not query['completed']:
+ response = vk.docs.search(q=query['string'], count=1000)
+ else:
+ response = vk.docs.search(q=query['string'], count=1)
+
+ pics_array = response.popitem()[1]
+ for pic in pics_array:
+ image_url = pic['url'] # Full document URL, query string included
+ image_url_clean = image_url.split('?')[0] # Get url without params
- # If the image we are getting is new do stuff
- if image_url_clean != query['last_url']:
- photos_processed += 1
- query['last_url'] = image_url_clean
+ # Only process the document when its URL differs from the last one seen for this query
+ if image_url_clean != query['last_url']:
+ photos_processed += 1
+ update_status()
+ query['last_url'] = image_url_clean
- text = detect(image_url+query['string'])
- if interests == "all":
- if not text.isspace():
- photos_saved += 1
- save_photo(image_url)
- else:
- for interest in interests:
- if interest in text:
+ text = detect(image_url+query['string'])
+ if interests == "all":
+ if not text.isspace():
photos_saved += 1
save_photo(image_url)
-
- sys.stdout.write("\033[F")
- sys.stdout.write("\033[K")
- sys.stdout.write("\033[F")
- sys.stdout.write("\033[K")
- c.print(f'> Documents scanned {photos_processed}')
- c.print(f'> Documents saved {photos_saved}')
+ else:
+ for interest in interests:
+ if interest in text:
+ photos_saved += 1
+ save_photo(image_url)
sleep(1)
except KeyboardInterrupt:
- c.print('Goodbye!', style='blue')
- sys.exit()
\ No newline at end of file
+ c.print(' Goodbye!', style='blue')
+ sys.exit()
diff --git a/utils/detect.py b/utils/detect.py
index 81486e0..e501c64 100644
--- a/utils/detect.py
+++ b/utils/detect.py
@@ -5,5 +5,9 @@ import requests
def detect(url):
response = requests.get(url)
- img = Image.open(io.BytesIO(response.content))
+ try:
+ img = Image.open(io.BytesIO(response.content))
+ except OSError:
+ return ''
+
return pytesseract.image_to_string(img, lang='rus', timeout=30).lower()
diff --git a/utils/savePhoto.py b/utils/savePhoto.py
index 3ce0022..8d7031e 100644
--- a/utils/savePhoto.py
+++ b/utils/savePhoto.py
@@ -1,8 +1,11 @@
from rich.console import Console
+import calendar;
+import time;
c = Console()
+ts = calendar.timegm(time.gmtime())
def save_photo(url):
- f = open("output.txt", "a", encoding='utf-8')
- f.write(url)
+ f = open(f"output/{ts}.html", "a", encoding='utf-8')
+ f.write(f'
')
f.close()
\ No newline at end of file