import logging

from library.models import Collection, Work, WorkMeta, Document

logger = logging.getLogger(__name__)


def sync_work(work: Work) -> None:
    """Create documents for PDFs in the work's storage folder that are not yet known.

    The folder is looked up through the work's ``folderid`` meta entry and
    listed via the collection's storage instance. For every listed file:

    * if its id matches an existing document's upload, it is skipped;
    * if its name does not end in ``.pdf`` (case-insensitive), it is skipped;
    * otherwise a PDF document is created on the work and auto-tagged.

    Existing documents whose id was not seen in the folder listing are only
    reported with a warning; nothing is deleted.

    Args:
        work: The work whose documents should be synchronised.

    Raises:
        Whatever ``work.meta_info.get`` raises when the work has no
        ``folderid`` entry is propagated unchanged.
    """
    logger.info("Syncing '%s'", work.name)

    folder_id = work.meta_info.get(name="folderid").value
    storage = work.collection.storage.instance()
    prefix = work.collection.storage.name

    _, files = storage.listdir(folder_id)

    # Stored uploads have the form "<prefix>:<rest>"; only the part after the
    # first colon is handed to the storage to recover the file id.
    existing = {
        storage.parse_id(upload.partition(":")[2])
        for upload in work.docs.values_list("upload", flat=True)
    }
    logger.debug("%d existing documents", len(existing))

    for file in files:
        if file.id in existing:
            logger.debug("%30s: Skipping existing (%s)", file.name, file.id)
            # Whatever is left in ``existing`` after the loop has no
            # counterpart in the folder any more.
            existing.discard(file.id)
            continue

        if not file.name.lower().endswith(".pdf"):
            logger.debug("%40s: Not a PDF", file.name)
            continue

        logger.info("%40s: Adding", file.name)
        # NOTE(review): this interpolates ``str(file)`` rather than ``file.id``;
        # presumably the file type's string form is what ``parse_id`` expects
        # -- confirm against the storage implementation.
        doc = work.docs.create(upload=f"{prefix}:{file}", doctype=Document.DOCTYPE_PDF)
        doc.auto_tag()

    for uri in existing:
        logger.warning("Local entry not in folder: %s", uri)


def sync_collection(collection: Collection, sync_existing: bool = False) -> None:
    """Create a work for every folder under the collection's prefix folder.

    Folders listed under ``collection.prefix`` are matched against the
    ``folderid`` meta entries of the collection's works. Unknown folders get a
    new work (with a ``folderid`` entry) which is then synced via
    ``sync_work``. Known folders are re-synced only when ``sync_existing`` is
    true. Works whose folder id did not appear in the listing are reported
    with a warning; nothing is deleted.

    Args:
        collection: The collection to synchronise.
        sync_existing: When true, also run ``sync_work`` on works whose folder
            is already known.

    Raises:
        RuntimeError: If the collection's storage class path does not end in
            ``GDriveLinkStorage``.
        KeyError: If the collection has no prefix (it must hold the folder id).
    """
    logger.info("Syncing '%s'", collection)

    if not collection.storage.storage.endswith("GDriveLinkStorage"):
        raise RuntimeError("Not a gdrive storage")
    if not collection.prefix:
        raise KeyError("Prefix must store folder id")

    # Maps folder id -> primary key of the work already tied to that folder.
    existing = dict(
        WorkMeta.objects.filter(
            work__collection=collection, name="folderid"
        ).values_list("value", "work_id")
    )

    storage = collection.storage.instance()
    folders, _ = storage.listdir(collection.prefix)

    for folder in folders:
        if folder.id in existing:
            if sync_existing:
                logger.info("%40s: Syncing (%s)", folder.name, folder.id[:12])
                sync_work(Work.objects.get(pk=existing[folder.id]))
            # Seen in the drive listing, so it is not a stale entry.
            del existing[folder.id]
            continue

        logger.info("%40s: Adding", folder.name)
        work = Work(name=folder.name, collection=collection)
        work.save()
        work.meta_info.create(name="folderid", value=folder.id)
        sync_work(work)

    # Iterate key/value pairs: looping over the dict itself yields only the
    # folder id strings, which cannot be unpacked into (folder id, work id).
    for folderid, work_id in existing.items():
        logger.warning("Folder for work %d no longer in drive (%s)", work_id, folderid)