Конкурс Python - Dump VK (фото, документы, диалоги+вложения)

gh0st4ge · 19.12.2018

Как обещал, прикрепляю результат моих трудов

Python:

import json
import shutil
from datetime import datetime
from itertools import repeat
from multiprocessing import Pool
from os import cpu_count, makedirs
from os.path import join, exists

import requests
import vk_api

from error_log import ErrorLog

# Characters that are replaced with '_' when file and directory names are built.
INVALID_CHARS = ['\\', '/', ':', '*', '?', '<', '>', '|', '"']
# Shared error collector: entries are added with error_log.add(source, exception)
# and written out with error_log.save_log(...) at the end of the script.
error_log = ErrorLog()


def user_friendly_size(size):
    """Format a byte count as a human-readable string.

    Values below 1024 are returned unrounded as ``'<size> bytes'``; larger
    values are scaled by powers of 1024 and shown with two decimals, for
    example ``'1.50 KB'``.  Values beyond the last known suffix stay in
    ``YB`` instead of raising ``IndexError``.

    :param size: size in bytes (int or float).
    :return: formatted string such as ``'512 bytes'`` or ``'3.25 MB'``.
    """
    suffix_set = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
    level = 0
    # ``>=`` so that exactly 1024 bytes is reported as 1.00 KB; the level is
    # capped so the suffix lookup can never run past the end of the list.
    while size >= 1024 and level < len(suffix_set) - 1:
        level += 1
        size = size / 1024
    if level != 0:
        size = f'{size:.2f}'
    return f'{size} {suffix_set[level]}'


class LoginVK:
    """Log in to VK and keep the API handles used by the collectors.

    After a successful login ``vk`` holds the object returned by
    ``VkApi.get_api()``, ``vk_tools`` a ``vk_api.VkTools`` built from it and
    ``account`` the response of ``account.getProfileInfo``.
    """

    API_VERSION = '5.92'
    APP_ID = 6781880

    def __init__(self, login_data):
        """Store the credentials and log in immediately.

        :param login_data: mapping with either a non-empty ``token`` or both
            ``login`` and ``password`` keys.
        :raises ConnectionError: if the login fails for any reason.
        """
        self.login_data = login_data
        self.vk = None
        self.vk_tools = None
        self.account = None
        self.login_vk()

    def login_vk(self):
        """Create the VK session and fill ``vk``, ``vk_tools`` and ``account``.

        A non-empty token takes precedence over the login/password pair.

        :raises ConnectionError: wraps every failure (missing credentials,
            authorisation or API errors); the original exception is kept as
            ``__cause__`` so the traceback shows what actually went wrong.
        """
        try:
            if 'token' in self.login_data and self.login_data['token']:
                token = self.login_data['token']
                vk_session = vk_api.VkApi(token=token, app_id=LoginVK.APP_ID, auth_handler=self.auth_handler,
                                          api_version=LoginVK.API_VERSION)
            elif 'login' in self.login_data and 'password' in self.login_data:
                login, password = self.login_data['login'], self.login_data['password']
                vk_session = vk_api.VkApi(login, password, captcha_handler=self.captcha_handler, app_id=LoginVK.APP_ID,
                                          api_version=LoginVK.API_VERSION, auth_handler=self.auth_handler)
                vk_session.auth(token_only=True, reauth=True)
            else:
                raise KeyError('Введите токен или пару логин-пароль')
            self.vk = vk_session.get_api()
            self.vk_tools = vk_api.VkTools(self.vk)
            self.account = self.vk.account.getProfileInfo()
        except Exception as e:
            # Callers only need to handle one exception type; chain the cause
            # explicitly instead of flattening it into a string.
            raise ConnectionError(e) from e

    @staticmethod
    def auth_handler():
        """Ask for the two-factor code on stdin.

        :return: ``(code, remember_device)``; the device is always remembered.
        """
        key = input('Введите код двухфакторой аутентификации: ')
        remember_device = True
        return key, remember_device

    @staticmethod
    def captcha_handler(captcha):
        """Show the captcha URL, read the answer from stdin and retry with it."""
        key = input(f"Введите капчу ({captcha.get_url()}): ").strip()
        return captcha.try_again(key)


class MethodsVK:
    """Collect albums, documents and conversations of the logged-in account.

    Everything is fetched in the constructor; the results are available as
    the ``albums``, ``docs``, ``conversations`` and ``users`` attributes.
    ``users`` maps ``abs(id)`` of a user or group to a display name.
    """

    # Attachment types whose files are collected from messages.
    ATTACHMENT_TYPES = ('photo', 'doc', 'audio_message')

    def __init__(self, vk, vk_tools, account):
        """Keep the API handles and run all collectors.

        :param vk: API object used for direct method calls.
        :param vk_tools: helper exposing ``get_all`` for paginated methods.
        :param account: profile info of the logged-in account.
        """
        self.vk = vk
        self.vk_tools = vk_tools
        self.account = account
        self.albums = []
        self.docs = []  # replaced by a dict in get_docs()
        self.users = {}
        self.conversations = []
        self.get_albums()
        self.get_docs()
        self.get_conversations()

    def get_albums(self):
        """Fill ``albums`` with ``{'name': ..., 'photos': [url, ...]}`` entries.

        A failure for one album is logged and does not stop the others.
        """
        for album in self.vk.photos.getAlbums(need_system=1)['items']:
            try:
                photos = self.vk_tools.get_all(values={'album_id': album['id'], 'photo_sizes': 1}, method='photos.get',
                                               max_count=1000)
                name = '_'.join(album['title'].split(' '))
                # The name is later used as a directory name, so it gets the
                # same character clean-up as dialog names.
                for c in INVALID_CHARS:
                    name = name.replace(c, '_')
                # NOTE(review): takes the last entry of ``sizes``, presumably
                # the largest one - confirm against the API response order.
                self.albums.append({'name': name,
                                    'photos': [p['sizes'][-1]['url'] for p in photos['items']]})
            except Exception as e:
                error_log.add('get_albums', e)

    def get_docs(self):
        """Fill ``docs`` with the document list, its count and total size."""
        docs = self.vk.docs.get()
        self.docs = {'count': docs['count'], 'size': 0, 'docs': []}
        for doc in docs['items']:
            try:
                self.docs['docs'].append({'url': doc['url'], 'name': f"{doc['title']}_{doc['id']}.{doc['ext']}"})
                self.docs['size'] += doc['size']
            except Exception as e:
                error_log.add('get_docs', e)
        # The byte total is replaced by its human-readable form.
        self.docs['size'] = user_friendly_size(self.docs['size'])

    def get_conversations(self):
        """Fetch every conversation and store its dialog data."""
        conversations = self.vk_tools.get_all(method='messages.getConversations', max_count=100,
                                              values={'extended': 1, 'fields': 'first_name, last_name, name'})
        for conversation in conversations['items']:
            self.conversations.append(self.get_dialog_data(conversation))

    def get_dialog_data(self, conversation):
        """Build the dialog record: id, type, sanitised name, messages, attachments."""
        dialog = {'id': conversation['conversation']['peer']['id'],
                  'type': conversation['conversation']['peer']['type']}
        if dialog['type'] in ['user', 'group']:
            self.users_add(dialog['id'])
            dialog['name'] = self.users[abs(dialog['id'])]
        elif dialog['type'] == 'chat':
            dialog['name'] = conversation['conversation']['chat_settings']['title']
        else:
            dialog['name'] = r'{unknown}'
        for c in INVALID_CHARS:
            dialog['name'] = dialog['name'].replace(c, '_')
        dialog['messages'], dialog['attachments'] = self.get_messages(dialog['id'])
        return dialog

    def users_add(self, uid):
        """Resolve the display name for ``uid`` and cache it in ``users``.

        Positive ids are looked up as users, others as groups.  Names are
        cached under ``abs(uid)``, so the cache check uses the same key;
        checking the signed id would miss every group and repeat the API
        call for each of its messages.  Any failure is logged and stored as
        ``'{unknown user}'``.
        """
        try:
            if abs(uid) in self.users:
                return
            if uid > 0:
                user = self.vk.users.get(user_ids=uid)[0]
                if ('deactivated' in user) and (user['deactivated'] == 'deleted') and (
                        user['first_name'] == 'DELETED'):
                    name = 'DELETED'
                else:
                    name = f"{user['first_name']} {user['last_name']}"
            else:
                group = self.vk.messages.getConversationsById(peer_ids=uid, extended=1)['groups'][0]
                name = group['name']
            self.users[abs(uid)] = name
        except Exception as e:
            error_log.add('users_add', e)
            self.users[abs(uid)] = '{unknown user}'

    def get_messages(self, cid):
        """Download the history of peer ``cid``.

        :return: ``(messages, attachments)`` where ``messages`` is a list of
            records from ``message_handler`` and ``attachments`` maps an
            attachment type to the list of collected items.
        """
        messages = []
        attachments = {}
        history = self.vk_tools.get_all(max_count=200, method='messages.getHistory',
                                        values={'rev': 1, 'extended': 1, 'fields': 'first_name, last_name',
                                                'peer_id': cid})
        for message in history['items']:
            try:
                # No-op when the sender is already cached.
                self.users_add(message['from_id'])
                mess = self.message_handler(message)
                if mess:
                    messages.append(mess)
                for at_type, items in self.attach_handler(message).items():
                    attachments.setdefault(at_type, []).extend(items)
            except Exception as e:
                error_log.add('get_messages', e)
        return messages, attachments

    def message_handler(self, msg):
        """Convert a message with text into ``{'text', 'from', 'date'}``.

        :return: the record, or ``None`` when the message has no text.
            ``text`` is the list of non-empty lines; ``from`` is the cached
            sender name (``None`` if unknown).
        """
        if not msg['text']:
            return None
        text = [line for line in msg['text'].split('\n') if line]
        # Names are cached under abs(id); group senders have negative ids.
        return {'text': text, 'from': self.users.get(abs(msg['from_id'])),
                'date': str(datetime.fromtimestamp(msg['date']))}

    @staticmethod
    def _file_entry(at, url, **extra):
        """Build a download entry; ``access_key`` is included only if present."""
        entry = {'url': url}
        if 'access_key' in at:
            entry['access_key'] = at['access_key']
        entry.update(extra)
        return entry

    @staticmethod
    def attach_handler(msg):
        """Extract downloadable attachments from one message.

        :return: dict mapping ``'photo'`` to a list of URLs and ``'doc'`` /
            ``'audio_message'`` to lists of entries with ``url``, ``name``
            and, when available, ``access_key``.  Other attachment types
            are ignored; a broken attachment is logged and skipped.
        """
        attachments = {}
        for attach in msg.get('attachments') or []:
            try:
                at_type = attach['type']
                if at_type not in MethodsVK.ATTACHMENT_TYPES:
                    continue
                at = attach[at_type]
                if at_type == 'photo':
                    item = at['sizes'][-1]['url']
                elif at_type == 'doc':
                    item = MethodsVK._file_entry(at, at['url'],
                                                 size=user_friendly_size(at['size']),
                                                 name=f"{at['title']}_{at['id']}.{at['ext']}",
                                                 date=str(datetime.fromtimestamp(at['date'])))
                else:
                    # The link points to an OGG file, so use the matching extension.
                    item = MethodsVK._file_entry(at, at['link_ogg'],
                                                 name=f"{at['owner_id']}_{at['id']}.ogg")
                attachments.setdefault(at_type, []).append(item)
            except Exception as e:
                error_log.add('attach_handler', e)
        return attachments


class DownloadManager:
    """Download lists of files in parallel worker processes."""

    # cpu_count() may return None when the number of CPUs cannot be determined.
    PULL_PROCESSES = 4 * (cpu_count() or 1)

    @staticmethod
    def download_engine(object_list, path):
        """Download every object of ``object_list`` into directory ``path``.

        :param object_list: URLs (str) or dicts with ``url`` and optional
            ``name`` / ``access_key``.
        :param path: target directory; created if missing.
        :return: list of failure records ``{'message', 'obj'}``; empty when
            everything was downloaded.
        """
        makedirs(path, exist_ok=True)
        object_list = list(object_list)
        if not object_list:
            # Nothing to do: do not spawn worker processes for an empty list.
            return []
        workers = min(DownloadManager.PULL_PROCESSES, len(object_list))
        with Pool(workers) as pool:
            results = pool.starmap(DownloadManager.download, zip(object_list, repeat(path)))
        # Return only the failures.
        return [file for file in results if file is not True]

    @staticmethod
    def download(obj, root):
        """Download one object into ``root`` unless the file already exists.

        The data is written to a temporary ``.part`` file and renamed on
        success, so an interrupted download never leaves a truncated file
        that a later run would skip as already downloaded.

        :return: ``True`` on success (or when the file already exists),
            otherwise ``{'message': str(error), 'obj': obj}``.
        """
        from os import getpid, remove, replace

        try:
            filename, url = DownloadManager.get_filename_url(obj)
            path = join(root, filename)
            if not exists(path):
                # The pid keeps temp files of parallel workers apart.
                part_path = f'{path}.{getpid()}.part'
                try:
                    with requests.get(url, stream=True, timeout=(30, 5)) as response:
                        # Do not save HTTP error pages as if they were the file.
                        response.raise_for_status()
                        # Let urllib3 undo gzip/deflate transfer encoding.
                        response.raw.decode_content = True
                        with open(part_path, 'wb') as f:
                            shutil.copyfileobj(response.raw, f)
                    replace(part_path, path)
                except Exception:
                    if exists(part_path):
                        remove(part_path)
                    raise
            return True
        except Exception as e:
            error_log.add('download', e)
            return {'message': str(e), 'obj': obj}

    @staticmethod
    def get_filename_url(obj):
        """Derive ``(filename, url)`` from a URL string or an attachment dict.

        For a dict the ``name`` option is used when present, otherwise the
        last path segment of the URL.  Characters from ``INVALID_CHARS`` are
        replaced with ``'_'``.

        :raises ValueError: for an empty object, an unsupported type or when
            no file name can be derived.
        """
        if not obj:
            raise ValueError('Объект пустой')
        if isinstance(obj, str):
            url = obj
            filename = url.split('/')[-1]
        elif isinstance(obj, dict):
            url = obj['url']
            # The URL's last segment is the fallback when no name is given.
            filename, url = DownloadManager.update_path(url, filename=url.split('/')[-1], options=obj)
        else:
            raise ValueError('Неизвестный объект')
        if not filename:
            # An empty name would resolve to the directory itself, which
            # "exists" and would be reported as already downloaded.
            raise ValueError('Не удалось определить имя файла')
        for char in INVALID_CHARS:
            filename = filename.replace(char, '_')
        return filename, url

    @staticmethod
    def update_path(url, filename=None, options=None):
        """Apply the ``name`` and ``access_key`` options to a download target.

        :param url: source URL.
        :param filename: name used when ``options`` has no ``name``.
        :param options: optional dict; ``name`` (spaces become ``'_'``)
            overrides ``filename`` and ``access_key`` is appended to the URL
            as a query parameter.
        :return: ``(filename, url)``.
        """
        if options is None:
            options = {}
        if 'name' in options:
            filename = '_'.join(options['name'].split(' '))
        if 'access_key' in options:
            # Extend an existing query string instead of starting a second one.
            separator = '&' if '?' in url else '?'
            url = f"{url}{separator}access_key={options['access_key']}"
        return filename, url


class DumpVK:
    """Write collected data to disk and optionally download the files.

    Recognised ``config`` keys: ``account``, ``users``, ``albums``, ``docs``,
    ``conversations``, ``path`` (default ``'dump'``), ``download``,
    ``download_errors`` (required: file that receives failed downloads) and
    ``ensure_ascii`` (optional, default ``True``; set to ``False`` to store
    non-ASCII text readably instead of as ``\\uXXXX`` escapes).
    """

    def __init__(self, config):
        """Read the settings from ``config``.

        :raises KeyError: if ``download_errors`` is missing.
        """
        self.account = self.get_from_config(config, 'account')
        self.users = self.get_from_config(config, 'users')
        self.albums = self.get_from_config(config, 'albums') or []
        self.docs = self.get_from_config(config, 'docs') or []
        self.conversations = self.get_from_config(config, 'conversations') or []
        self.path = self.get_from_config(config, 'path') or 'dump'
        self.download = self.get_from_config(config, 'download')
        self.download_errors_path = config['download_errors']
        ensure_ascii = self.get_from_config(config, 'ensure_ascii')
        # Escaped output stays the default so existing readers keep working.
        self.ensure_ascii = True if ensure_ascii is None else bool(ensure_ascii)
        self.errors = []

    @staticmethod
    def get_from_config(config, parameter):
        """Return ``config[parameter]`` or ``None`` when the key is absent."""
        return config.get(parameter)

    def _write_json(self, data, filename):
        """Serialise ``data`` to ``filename`` as UTF-8 encoded JSON."""
        with open(filename, 'w', encoding='utf8') as json_file:
            json.dump(data, json_file, ensure_ascii=self.ensure_ascii)

    def dump(self):
        """Dump account info, albums, documents and dialogs under ``path``.

        Failed downloads are collected and, if there are any, written to the
        ``download_errors`` file.
        """
        self.dump_info(self.path)
        self.dump_albums(self.albums, join(self.path, 'photos'), self.download)
        self.dump_docs(self.docs, join(self.path, 'docs'), self.download)
        self.dump_dialogs(self.conversations, join(self.path, 'dialogs'), self.download)
        if self.errors:
            self._write_json(self.errors, self.download_errors_path)

    def dump_albums(self, albums, path, download=True):
        """Write ``albums.json`` and, if ``download``, fetch every album's photos."""
        makedirs(path, exist_ok=True)
        self._write_json(albums, join(path, 'albums.json'))
        if download:
            for album in albums:
                current_path = join(path, album['name'])
                self.errors += DownloadManager.download_engine(album['photos'], current_path)

    def dump_info(self, path):
        """Write ``account.json`` and ``users.json`` for the data that is set."""
        makedirs(path, exist_ok=True)
        if self.account:
            self._write_json(self.account, join(path, 'account.json'))
        if self.users:
            self._write_json(self.users, join(path, 'users.json'))

    def dump_docs(self, docs, path, download=True):
        """Write ``docs.json`` and, if ``download``, fetch the listed documents."""
        makedirs(path, exist_ok=True)
        self._write_json(docs, join(path, 'docs.json'))
        # ``docs`` may be an empty list when nothing was collected.
        if download and 'docs' in docs:
            self.errors += DownloadManager.download_engine(docs['docs'], path)

    def dump_dialogs(self, dialogs, path, download=True):
        """Write ``conversations.json``; with ``download`` also per-dialog data.

        For every dialog a directory ``<name>_<id>`` receives the dialog as
        JSON, a plain-text transcript and the downloaded attachments grouped
        by type.
        """
        makedirs(path, exist_ok=True)
        self._write_json(dialogs, join(path, 'conversations.json'))
        if not download:
            return
        for dialog in dialogs:
            dialog_name = f"{dialog['name'].replace(' ', '_')}_{dialog['id']}"
            current_path = join(path, dialog_name)
            makedirs(current_path, exist_ok=True)
            self._write_json(dialog, join(current_path, f'{dialog_name}.json'))
            DumpVK.save_conversation_txt(dialog['messages'], join(current_path, f'{dialog_name}.txt'))
            for mode in ['photo', 'doc', 'audio_message']:
                try:
                    if mode in dialog['attachments']:
                        attachment_path = join(current_path, mode)
                        self.errors += DownloadManager.download_engine(dialog['attachments'][mode], attachment_path)
                except Exception as e:
                    error_log.add('dump_dialogs', e)

    @staticmethod
    def save_conversation_txt(messages, filename):
        """Write a tab-separated transcript: sender, text, ``[date]``.

        When consecutive messages come from the same sender, the name is
        replaced by spaces on the repeated lines.  A message that cannot be
        formatted is logged and skipped.
        """
        with open(filename, 'w', encoding='utf8') as dialog_file:
            previous = None
            for message in messages:
                try:
                    text = '\n'.join(message["text"])
                    if message["from"]:
                        name = f'{message["from"]}:'
                    else:
                        name = 'unknown_user'
                    if name == previous:
                        current = ' ' * (len(name) + 2)
                    else:
                        current = name
                    text_message = f'{current}\t{text}\t[{message["date"]}]'
                    dialog_file.write(f'{text_message}\n')
                    previous = name
                except Exception as e:
                    error_log.add('save_conversation_txt', e)


def dump_manager(config):
    """Run the action selected by ``config['mode']``.

    ``collect`` gathers the data from VK and dumps it, ``dump`` dumps
    collections previously saved to the files named in ``dump_config``, and
    ``redump_errors`` retries the downloads listed in the errors file.  Any
    other mode does nothing.
    """
    dump_config = config['dump_config']
    mode = config['mode']

    if mode == 'redump_errors':
        try_dump_error_list(dump_config['path'], dump_config['download_errors'])
        return
    if mode == 'collect':
        vk = collect(config['login_data'])
        dump_config.update(albums=vk.albums, docs=vk.docs, conversations=vk.conversations,
                           users=vk.users, account=vk.account)
    elif mode == 'dump':
        # Load all three collections before touching dump_config.
        loaded = {key: load_collection(dump_config[key]) for key in ('albums', 'docs', 'conversations')}
        dump_config.update(loaded)
    else:
        return
    DumpVK(dump_config).dump()


def collect(config):
    """Log in with the credentials in ``config`` and collect the account data.

    :return: the ``MethodsVK`` instance built from the logged-in session.
    """
    session = LoginVK(config)
    api, tools, profile = session.vk, session.vk_tools, session.account
    return MethodsVK(api, tools, profile)


def load_collection(filename):
    """Load a JSON collection from ``filename``.

    :return: the parsed content, or ``[]`` when the file cannot be opened,
        decoded or parsed (best effort: a missing collection is treated as
        empty).
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(filename, 'r', encoding='utf8') as collection_file:
            return json.load(collection_file)
    except (OSError, ValueError, TypeError):
        # OSError: unreadable file; ValueError: bad JSON or bad encoding;
        # TypeError: ``filename`` is not a path (e.g. None).  Unlike a bare
        # ``except`` this lets KeyboardInterrupt/SystemExit through.
        return []


def try_dump_error_list(path, filename):
    """Retry the downloads recorded in the errors file.

    The objects of all recorded failures are downloaded into
    ``<path>/errors``; the errors file is then overwritten with the
    failures of this attempt (an empty list when everything succeeded).

    :param path: dump root directory.
    :param filename: JSON file with a list of ``{'message', 'obj'}`` records.
    """
    # Explicit UTF-8 on both ends so the file does not depend on the platform
    # default encoding.
    with open(filename, 'r', encoding='utf8') as errors_file:
        error_list = json.load(errors_file)
    dump_list = [error['obj'] for error in error_list]
    errors = DownloadManager.download_engine(dump_list, join(path, 'errors'))
    with open(filename, 'w', encoding='utf8') as download_errors_file:
        json.dump(errors, download_errors_file)


if __name__ == '__main__':
    # Script entry point: read config.json from the current working directory
    # and run the mode it selects.
    try:
        with open('config.json', 'r') as config_file:
            vk_config = json.load(config_file)
        dump_manager(vk_config)
    except Exception as e:
        # Record the failure instead of crashing so the log below is still written.
        error_log.add(__name__, e)
    # Always persist the shared error log.
    error_log.save_log('error.log')

Сергей Попов · 19.12.2018

Обращение ко всем авторам: одним из условий участия в конкурсе является

WebWare Team сказал(а):
Размести ссылку на опубликованную статью в комментарии к анонсу конкурса

jess_hide · 19.12.2018

Да ты крут))

hikiko4ern · 20.12.2018

Что-то мне подсказывает, что твой код что-то уж сильно похож на этот хд

всё понимаю, но копипаст/небольшое_изменение исходников, коим является 66% твоего кода - это уж слишком ᕙ(⇀‸↼‶)ᕗ

r4z0r5 · 22.12.2018

Хорошая статья, не придется вникать в api.vk.com и developers. Сам писал подобное, очень эффективный инструмент. Код п**дить, конечно, нехорошо, но ты собрал информацию в одном месте.

dob · 23.12.2018

Почему у меня в json файлах диалогов вместо букв \u043d\u0430 \u041a\u0430\u0440\u0442\u0443 ?

hikiko4ern · 23.12.2018

dob сказал(а):
Почему у меня в json файлах диалогов вместо букв \u043d\u0430 \u041a\u0430\u0440\u0442\u0443 ?

Решение на

Ссылка скрыта от гостей

Исправил vk.py из source.zip ОПа

codefather_ru · 02.03.2019

gh0st4ge сказал(а):
Python:
vk_session = vk_api.VkApi(login, password, app_id=APP_ID, api_version=API_VERSION)
vk_session.auth(token_only=True, reauth=True)

попытка входа по логину\паролю дает "vk_api.exceptions.AuthError: API auth error: This application has no right to use messages"

gh0st4ge сказал(а):
token = '' # логинимся по токену
vk = login_vk(token)

этот вариант кода так же не работает.
а если заменить на логин\пароль - получим vk_api.exceptions.ApiError: [5] User authorization failed: invalid access_token (4).

hikiko4ern · 04.03.2019

codefather_ru сказал(а):
попытка входа по логину\паролю дает "vk_api.exceptions.AuthError: API auth error: This application has no right to use messages"

этот вариант кода так же не работает.
а если заменить на логин\пароль - получим vk_api.exceptions.ApiError: [5] User authorization failed: invalid access_token (4).

админы прикрыли доступ к апи сообщений. Как вариант - юзать токен от оф. или прошедших модерацию приложений.

Набросал как-то ~~кривой~~ скрипт, он получает токен, прикидываясь приложением для ведра. Надо лишь логин и пароль вписать в самом коде.

taxumicore · 28.04.2019

hikiko4ern сказал(а):
админы прикрыли доступ к апи сообщений. Как вариант - юзать токен от оф. или прошедших модерацию приложений.

Набросал как-то ~~кривой~~ скрипт, он получает токен, прикидываясь приложением для ведра. Надо лишь логин и пароль вписать в самом коде.

не компилится
почему?

Код:

Microsoft Windows [Version 6.1.7601]
Copyright (c) 2009 Microsoft Corporation.  All rights reserved.

C:\Users\Administrator>C:\Users\Administrator\Downloads\auth\auth.py
Traceback (most recent call last):
  File "C:\Users\Administrator\Downloads\auth\auth.py", line 2, in <module>
    import requests
ModuleNotFoundError: No module named 'requests'

C:\Users\Administrator>

hikiko4ern · 28.04.2019

taxumicore сказал(а):
не компилится
почему?

потому что не установлены зависимости?
не просто так ведь там написано No module named 'requests'

taxumicore · 28.04.2019

hikiko4ern сказал(а):
потому что не установлены зависимости?
не просто так ведь там написано No module named 'requests'

Установил модуль requests. Теперь новая ошибка:

Код:

Microsoft Windows [Version 6.1.7601]
Copyright (c) 2009 Microsoft Corporation.  All rights reserved.

C:\Users\Administrator>C:\Users\Administrator\Downloads\auth\auth.py
Traceback (most recent call last):
  File "C:\Users\Administrator\Downloads\auth\auth.py", line 13, in <module>
    r2 = requests.get(t1['redirect_uri'])
KeyError: 'redirect_uri'

C:\Users\Administrator>

hikiko4ern · 29.04.2019

taxumicore сказал(а):
Установил модуль requests. Теперь новая ошибка:

hikiko4ern сказал(а):
Надо лишь логин и пароль вписать в самом коде.

Nickolass · 29.04.2019

Выдает ошибку в собранном билде файла error.txt

__main__ - list indices must be integers or slices, not str

DwY · 18.11.2020

Есть у кого-нибудь дампер сообщений, который создает html документ?
Что-то вроде такого?

Все сервисы Codeby

Поиск

Поиск

Конкурс Python - Dump VK (фото, документы, диалоги+вложения)

Вложения

gh0st4ge

Сергей Попов

jess_hide

hikiko4ern

r4z0r5

dob

hikiko4ern

Вложения

codefather_ru

hikiko4ern

Вложения

taxumicore

hikiko4ern

taxumicore

hikiko4ern

Nickolass

DwY

New member