Mercurial > lbo > hg > photosync
changeset 14:b83a9e2622b3
Download photos more efficiently.
By batching media item lookup. Also migrated to httplib2 and improved logging.
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Sat, 15 Jun 2019 10:49:09 +0200 |
parents | 8ac49bf5241f |
children | f941f1b0fa09 |
files | photosync.py |
diffstat | 1 files changed, 59 insertions(+), 32 deletions(-) [+] |
line wrap: on
line diff
--- a/photosync.py Fri Jun 14 23:45:35 2019 +0200 +++ b/photosync.py Sat Jun 15 10:49:09 2019 +0200 @@ -1,13 +1,15 @@ -import arguments import datetime -import dateutil.parser import json import os import os.path import pickle import sqlite3 -import urllib3 + +import arguments +import dateutil.parser +import httplib2 + from googleapiclient.discovery import build from google_auth_oauthlib.flow import InstalledAppFlow from google.auth.transport.requests import Request @@ -16,11 +18,13 @@ TRACE = True -def log(level, msg): +def log(level, msg, *args): if PROD: return if level == 'TRACE' and not TRACE: return + if args: + msg = msg.format(*args) print (level, "::", msg) @@ -67,7 +71,7 @@ def __init__(self, tokens=None): self._token_source = tokens self._service = build('photoslibrary', 'v1', credentials=tokens.creds()) - self._http = urllib3.PoolManager() + self._http = httplib2.Http() def list_library(self, start=None, to=None): """Yields items from the library. @@ -104,29 +108,43 @@ if pagetoken is None: return - def download_item(self, id, path, video): - """Download a item and store it under its file name in the directory `path`. - - First, the item is queried again in order to obtain the base URL (which - is not permanent). Then, the base URL is used to fetch the image/video - bytes. + def download_items(self, items): + """Download multiple items. Arguments: - id: Media ID of item. - path: Directory where to store it. - video: Boolean, whether item is video. + items: List of (id, path, video) tuples. + + Returns: + List of IDs that were successfully downloaded. """ - item = self._service.mediaItems().get(mediaItemId=id).execute() - rawurl = item['baseUrl'] - if video: - rawurl = '{url}=dv'.format(url=rawurl) - else: - rawurl = '{url}=d'.format(url=rawurl) - os.makedirs(path, exist_ok=True) - p = os.path.join(path, item['filename']) - with open(p, 'wb') as f: - f.write(self._http.request('GET', rawurl).data) - + ids = list(map(lambda i: i[0], items)) + media_items = self._service.mediaItems().batchGet(mediaItemIds=ids).execute() + ok = [] + i = -1 + for result in media_items['mediaItemResults']: + i += 1 + if 'status' in result: + log('WARN', 'Could not query info for {}: {}'.format(items[i][0], result['status'])) + continue + item = result['mediaItem'] + rawurl = item['baseUrl'] + if 'video' in item['mediaMetadata']: + rawurl += '=dv' + else: + rawurl += '=d' + os.makedirs(items[i][1], exist_ok=True) + p = os.path.join(items[i][1], item['filename']) + log('INFO', 'Downloading {}', p) + resp, cont = self._http.request(rawurl, 'GET') + if resp.status != 200: + log('WARN', 'HTTP item download failed: {} {}'.format(resp.status, resp.reason)) + continue + with open(p, 'wb') as f: + f.write(cont) + size = len(cont) / (1024. * 1024.) + log('INFO', 'Downloaded {} successfully ({:.2f} MiB)', p, size) + ok.append(item['id']) + return ok class DB: @@ -191,10 +209,11 @@ break yield row - def mark_item_downloaded(self, id): + def mark_items_downloaded(self, ids): with self._db as conn: - conn.cursor().execute('UPDATE items SET offline = 1 WHERE id = ?', (id,)) - self.record_transaction(id, 'DOWNLOAD') + for id in ids: + conn.cursor().execute('UPDATE items SET offline = 1 WHERE id = ?', (id,)) + self.record_transaction(id, 'DOWNLOAD') def existing_items_range(self): with self._db as conn: @@ -269,13 +288,21 @@ def download_items(self): """Scans database for items not yet downloaded and downloads them.""" + chunk = [] + chunksize = 16 for item in self._db.get_not_downloaded_items(): (id, path, filename, is_video) = item path = os.path.join(self._root, path) - log('INFO', 'Downloading {fn} into {p}'.format(fn=filename, p=path)) - self._svc.download_item(id, path, is_video) - log('INFO', 'Downloading {fn} successful'.format(fn=filename)) - self._db.mark_item_downloaded(id) + chunk.append((id, path, is_video)) + + if len(chunk) > chunksize: + ok = self._svc.download_items(chunk) + self._db.mark_items_downloaded(ok) + chunk = [] + + if len(chunk) > 0: + ok = self._svc.download_items(chunk) + self._db.mark_items_downloaded(ok) def drive(self, date_range=(None, None), window_heuristic=True): """First, download all metadata since most recently fetched item.