mirror of
https://github.com/anatolykopyl/dochunt.git
synced 2026-03-26 21:04:43 +00:00
🚸 Made output html instead of txt
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -2,6 +2,7 @@
|
|||||||
/.vscode
|
/.vscode
|
||||||
.DS_Store
|
.DS_Store
|
||||||
__pycache__
|
__pycache__
|
||||||
|
/output/*
|
||||||
|
|
||||||
config.ini
|
config.ini
|
||||||
vk_config.v2.json
|
vk_config.v2.json
|
||||||
65
main.py
65
main.py
@@ -1,4 +1,5 @@
|
|||||||
import configparser
|
import configparser
|
||||||
|
import argparse
|
||||||
import ast
|
import ast
|
||||||
import sys
|
import sys
|
||||||
from time import sleep
|
from time import sleep
|
||||||
@@ -17,6 +18,19 @@ if config['SEARCH']['interests'] != "all":
|
|||||||
else:
|
else:
|
||||||
interests = "all"
|
interests = "all"
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description='Get documents from vk.com')
|
||||||
|
parser.add_argument('-a', '--all', action='store_true',
|
||||||
|
help='Search through all availible documents instead of watching only the most recent uploads.')
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
def update_status():
|
||||||
|
sys.stdout.write("\033[F")
|
||||||
|
sys.stdout.write("\033[K")
|
||||||
|
sys.stdout.write("\033[F")
|
||||||
|
sys.stdout.write("\033[K")
|
||||||
|
c.print(f'> Documents scanned {photos_processed}')
|
||||||
|
c.print(f'> Documents saved {photos_saved}')
|
||||||
|
|
||||||
c = Console()
|
c = Console()
|
||||||
c.print('[b]Dochunt[/b] starting...', style='yellow')
|
c.print('[b]Dochunt[/b] starting...', style='yellow')
|
||||||
vk = authenticate()
|
vk = authenticate()
|
||||||
@@ -25,7 +39,8 @@ queries = []
|
|||||||
for i in range(len(terms)):
|
for i in range(len(terms)):
|
||||||
queries.append({
|
queries.append({
|
||||||
'string': terms[i],
|
'string': terms[i],
|
||||||
'last_url': ''
|
'last_url': '',
|
||||||
|
'completed': False
|
||||||
})
|
})
|
||||||
|
|
||||||
photos_processed = 0
|
photos_processed = 0
|
||||||
@@ -37,35 +52,35 @@ c.print(f'> Documents saved {photos_saved}')
|
|||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
for query in queries:
|
for query in queries:
|
||||||
response = vk.docs.search(q=query['string'], count=1)
|
if args.all and not query['completed']:
|
||||||
image_url = response.popitem()[1][0]['url'] # WTF not readable
|
response = vk.docs.search(q=query['string'], count=1000)
|
||||||
image_url_clean = image_url.split('?')[0] # Get url without params
|
else:
|
||||||
|
response = vk.docs.search(q=query['string'], count=1)
|
||||||
|
|
||||||
|
pics_array = response.popitem()[1]
|
||||||
|
for pic in pics_array:
|
||||||
|
image_url = pic['url'] # WTF not readable
|
||||||
|
image_url_clean = image_url.split('?')[0] # Get url without params
|
||||||
|
|
||||||
# If the image we are getting is new do stuff
|
# If the image we are getting is new do stuff
|
||||||
if image_url_clean != query['last_url']:
|
if image_url_clean != query['last_url']:
|
||||||
photos_processed += 1
|
photos_processed += 1
|
||||||
query['last_url'] = image_url_clean
|
update_status()
|
||||||
|
query['last_url'] = image_url_clean
|
||||||
|
|
||||||
text = detect(image_url+query['string'])
|
text = detect(image_url+query['string'])
|
||||||
if interests == "all":
|
if interests == "all":
|
||||||
if not text.isspace():
|
if not text.isspace():
|
||||||
photos_saved += 1
|
|
||||||
save_photo(image_url)
|
|
||||||
else:
|
|
||||||
for interest in interests:
|
|
||||||
if interest in text:
|
|
||||||
photos_saved += 1
|
photos_saved += 1
|
||||||
save_photo(image_url)
|
save_photo(image_url)
|
||||||
|
else:
|
||||||
sys.stdout.write("\033[F")
|
for interest in interests:
|
||||||
sys.stdout.write("\033[K")
|
if interest in text:
|
||||||
sys.stdout.write("\033[F")
|
photos_saved += 1
|
||||||
sys.stdout.write("\033[K")
|
save_photo(image_url)
|
||||||
c.print(f'> Documents scanned {photos_processed}')
|
|
||||||
c.print(f'> Documents saved {photos_saved}')
|
|
||||||
|
|
||||||
sleep(1)
|
sleep(1)
|
||||||
|
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
c.print('Goodbye!', style='blue')
|
c.print(' Goodbye!', style='blue')
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|||||||
@@ -5,5 +5,9 @@ import requests
|
|||||||
|
|
||||||
def detect(url):
|
def detect(url):
|
||||||
response = requests.get(url)
|
response = requests.get(url)
|
||||||
img = Image.open(io.BytesIO(response.content))
|
try:
|
||||||
|
img = Image.open(io.BytesIO(response.content))
|
||||||
|
except OSError:
|
||||||
|
return ''
|
||||||
|
|
||||||
return pytesseract.image_to_string(img, lang='rus', timeout=30).lower()
|
return pytesseract.image_to_string(img, lang='rus', timeout=30).lower()
|
||||||
|
|||||||
@@ -1,8 +1,11 @@
|
|||||||
from rich.console import Console
|
from rich.console import Console
|
||||||
|
import calendar;
|
||||||
|
import time;
|
||||||
c = Console()
|
c = Console()
|
||||||
|
ts = calendar.timegm(time.gmtime())
|
||||||
|
|
||||||
def save_photo(url):
|
def save_photo(url):
|
||||||
f = open("output.txt", "a", encoding='utf-8')
|
f = open(f"output/{ts}.html", "a", encoding='utf-8')
|
||||||
f.write(url)
|
f.write(f'<img width=400 src="{url}"><br>')
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
Reference in New Issue
Block a user