🚸 Save matched documents to an HTML file instead of a TXT file

2026-06-14 22:43:55 +00:00 · 2021-04-06 18:24:54 +03:00
parent 1d88d0e936
commit 9107c3a0f4
4 changed files with 51 additions and 28 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 /.vscode
 .DS_Store
 __pycache__
+/output/*

 config.ini
 vk_config.v2.json
--- a/main.py
+++ b/main.py
@@ -1,4 +1,5 @@
 import configparser
+import argparse 
 import ast
 import sys
 from time import sleep
@@ -17,6 +18,19 @@ if config['SEARCH']['interests'] != "all":
 else:
    interests = "all"

+parser = argparse.ArgumentParser(description='Get documents from vk.com')
+parser.add_argument('-a', '--all', action='store_true', 
+    help='Search through all availible documents instead of watching only the most recent uploads.')
+args = parser.parse_args()
+
+def update_status():
+    sys.stdout.write("\033[F")
+    sys.stdout.write("\033[K")
+    sys.stdout.write("\033[F")
+    sys.stdout.write("\033[K")
+    c.print(f'> Documents scanned {photos_processed}')
+    c.print(f'> Documents saved {photos_saved}')
+
 c = Console()
 c.print('[b]Dochunt[/b] starting...', style='yellow')
 vk = authenticate()
@@ -25,7 +39,8 @@ queries = []
 for i in range(len(terms)):
    queries.append({
        'string': terms[i],
-        'last_url': ''
+        'last_url': '',
+        'completed': False
    })

 photos_processed = 0
@@ -37,13 +52,20 @@ c.print(f'> Documents saved {photos_saved}')
 while True:
    try:
        for query in queries:
+            if args.all and not query['completed']:
+                response = vk.docs.search(q=query['string'], count=1000)
+            else:
                response = vk.docs.search(q=query['string'], count=1)
-            image_url = response.popitem()[1][0]['url'] # WTF not readable
+            
+            pics_array = response.popitem()[1]
+            for pic in pics_array:
+                image_url = pic['url'] # WTF not readable
                image_url_clean = image_url.split('?')[0] # Get url without params

                # If the image we are getting is new do stuff
                if image_url_clean != query['last_url']:
                    photos_processed += 1
+                    update_status()
                    query['last_url'] = image_url_clean

                    text = detect(image_url+query['string'])
@@ -57,13 +79,6 @@ while True:
                                photos_saved += 1
                                save_photo(image_url)

-            sys.stdout.write("\033[F")
-            sys.stdout.write("\033[K")
-            sys.stdout.write("\033[F")
-            sys.stdout.write("\033[K")
-            c.print(f'> Documents scanned {photos_processed}')
-            c.print(f'> Documents saved {photos_saved}')
-
            sleep(1)
    
    except KeyboardInterrupt:
--- a/utils/detect.py
+++ b/utils/detect.py
@@ -5,5 +5,9 @@ import requests

 def detect(url):
    response = requests.get(url)
+    try:
        img = Image.open(io.BytesIO(response.content))
+    except OSError:
+        return ''
+
    return pytesseract.image_to_string(img, lang='rus', timeout=30).lower()
--- a/utils/savePhoto.py
+++ b/utils/savePhoto.py
@@ -1,8 +1,11 @@
 from rich.console import Console
+import calendar;
+import time;
 c = Console()
+ts = calendar.timegm(time.gmtime())

 def save_photo(url):
-    f = open("output.txt", "a", encoding='utf-8')
-    f.write(url)
+    f = open(f"output/{ts}.html", "a", encoding='utf-8')
+    f.write(f'<img width=400 src="{url}"><br>')
    f.close()