From d9793be893b17dff96a8a2d36dc1f7676a969005 Mon Sep 17 00:00:00 2001
From: Hank Grabowski
Date: Sat, 15 Jan 2022 00:17:38 -0500
Subject: [PATCH] Add default start page and concept of restarting.

---
Review notes (free text in this position is not part of the commit message):

* Reviewer-amended. Saved records are now fully decoded (toList) before they
  are merged, so fromJson runs once per record and a record that fails to
  parse leaves the in-memory state untouched.
* Reloading images.json now has the same try/catch as the reload of
  postsAndComments.json.
* Hunk headers and the diffstat were recomputed for the amended line counts.
  NOTE(review): the blob ids on the "index" lines still come from the
  original commit.
* NOTE(review): context and removed lines are reproduced as extracted; generic
  type arguments look stripped (e.g. "Future> addEntryImages"). Confirm them
  against the repository before applying.
* NOTE(review): "Sleeping for ..." is still printed before the archives are
  written, and also on the final page where no sleep follows.

 bin/friendica_archiver.dart | 92 +++++++++++++++++++++++++++++++++++++++++--------
 bin/image_archiver.dart     |  8 ++++
 2 files changed, 85 insertions(+), 15 deletions(-)

diff --git a/bin/friendica_archiver.dart b/bin/friendica_archiver.dart
index 83876bb..4082daf 100644
--- a/bin/friendica_archiver.dart
+++ b/bin/friendica_archiver.dart
@@ -1,3 +1,4 @@
+import 'dart:convert';
 import 'dart:io';
 
 import 'package:args/args.dart';
@@ -12,6 +13,7 @@ const defaultRequestDelayMilliseconds = 5000;
 const defaultMaxPostsQuery = 1000000000;
 const defaultItemsPerPage = 20;
 const defaultDownloadImages = true;
+const defaultStartPage = 0;
 
 void main(List arguments) async {
   final argParser = _buildArgs();
@@ -39,12 +41,69 @@ void main(List arguments) async {
   final queryDelayMillis = int.parse(settings['delay']);
   final sleepDuration = Duration(milliseconds: queryDelayMillis);
   final itemsPerPage = int.parse(settings['items-per-page']);
-  final allEntries = [];
+  final firstPage = int.parse(settings['resume-page']);
+  // Keyed by entry id so that an entry reloaded from disk and the same entry
+  // fetched again from the server share one slot.
+  final allEntries = {};
+  final imageArchiveJsonFilePath = p.join(baseDirectory.path, 'images.json');
+  final postsJsonFile = p.join(baseDirectory.path, 'postsAndComments.json');
 
   print(
       "Max number of queries will be $maxQueries with $itemsPerPage items per page");
 
-  for (var page = 0; page < maxQueries; page++) {
+  if (firstPage != 0) {
+    print(
+        "Starting page is not zero therefore attempting to load image and post/comment archives from disk");
+    if (File(postsJsonFile).existsSync()) {
+      try {
+        final oldEntriesJson =
+            jsonDecode(File(postsJsonFile).readAsStringSync()) as List;
+        // map() is lazy: materialize once so fromJson is not re-run by the
+        // `length` call below, and so a bad record fails before any merge.
+        final oldEntries =
+            oldEntriesJson.map((j) => FriendicaEntry.fromJson(j)).toList();
+        for (final entry in oldEntries) {
+          allEntries[entry.id] = entry;
+        }
+        print('Loading ${oldEntries.length} post/comment entries from disk');
+      } catch (e) {
+        print(
+            'Error loading old entries, will be starting from scratch file: $e');
+      }
+    } else {
+      print(
+          'Entries file did not exist at location therefore assuming starting from scratch: $postsJsonFile');
+    }
+
+    if (File(imageArchiveJsonFilePath).existsSync()) {
+      // Best effort, like the post archive above: an unreadable images.json
+      // should not abort the run.
+      try {
+        final oldEntriesJson =
+            jsonDecode(File(imageArchiveJsonFilePath).readAsStringSync())
+                as List;
+        final oldEntries =
+            oldEntriesJson.map((j) => ImageEntry.fromJson(j)).toList();
+        for (final entry in oldEntries) {
+          final alreadyHadEntry = imageArchive.addDirectEntries(entry);
+          if (alreadyHadEntry) {
+            print("Image cache already had entry for: ${entry.url}");
+          }
+        }
+        print('Loading ${oldEntries.length} image entries from disk');
+      } catch (e) {
+        print(
+            'Error loading old image entries, will be starting from scratch file: $e');
+      }
+    } else {
+      print(
+          'Image archive file did not exist at location so assuming starting from scratch: $imageArchiveJsonFilePath');
+    }
+  }
+
+  print("Loading data from server");
+  for (var page = firstPage; page < maxQueries; page++) {
+    print("Querying for posts/comments for $page");
     final timelineResult =
         await client.getTimeline(username, page, itemsPerPage);
     if (timelineResult.isFailure) {
@@ -53,9 +112,9 @@ void main(List arguments) async {
     }
     final entries = timelineResult.value;
     print('# Post/Comments returned for Page $page: ${entries.length}');
-    allEntries.addAll(entries);
-    if (settings['download-images']) {
-      for (final entry in entries) {
+    for (final entry in entries) {
+      allEntries[entry.id] = entry;
+      if (settings['download-images']) {
         final imageEntryResults = await imageArchive.addEntryImages(entry);
         if (entry.images.isNotEmpty) {
           print(
@@ -64,30 +123,28 @@ void main(List arguments) async {
       }
     }
 
-    if (entries.length != itemsPerPage) {
-      print(
-          'Returned less than a full page, assuming at end of timeline and quiting');
-      break;
-    }
     print("Sleeping for $queryDelayMillis milliseconds before next query");
-    final postsJsonFile = p.join(baseDirectory.path, 'postsAndComments.json');
-    final postsJson = allEntries.map((e) => e.originalJson).toList();
 
     // Yes we are rewriting the entire file every time to preserve the results
     // over time.
+    final postsJson = allEntries.values.map((e) => e.originalJson).toList();
     File(postsJsonFile)
         .writeAsStringSync(PrettyJsonEncoder().convert(postsJson));
     print("Posts written to JSON file: $postsJsonFile");
 
     if (settings['download-images']) {
-      final imageArchiveJsonFilePath =
-          p.join(baseDirectory.path, 'images.json');
       File(imageArchiveJsonFilePath)
           .writeAsStringSync(PrettyJsonEncoder().convert(imageArchive.images));
       print('Images directory saved to: $imageArchiveJsonFilePath');
     }
 
-    sleep(sleepDuration);
+    if (entries.length != itemsPerPage) {
+      print(
+          'Returned less than a full page, assuming at end of timeline and quiting');
+      break;
+    } else {
+      sleep(sleepDuration);
+    }
   }
 
   return;
@@ -106,6 +163,11 @@ ArgParser _buildArgs() => ArgParser()
       help:
           'The server name for your instance. (e.g. if the URL in your browser is "https://friendica.com/" then this would be "friendica.com',
       mandatory: true)
+  ..addOption('resume-page',
+      abbr: 'r',
+      help:
+          'The page to restart the downloading process. Will try to read in existing posts and image archive data and start download from there. If set to 0 it resets from scratch.',
+      defaultsTo: '$defaultStartPage')
   ..addOption('delay',
       abbr: 'd',
       help:
diff --git a/bin/image_archiver.dart b/bin/image_archiver.dart
index a15e220..f4fb2c7 100644
--- a/bin/image_archiver.dart
+++ b/bin/image_archiver.dart
@@ -21,6 +21,14 @@ class ImageArchiver {
     imageDirectory.createSync(recursive: true);
   }
 
+  /// Stores [entry] under its URL, replacing any entry already held for that
+  /// URL. Returns true when the URL was already present before the call.
+  bool addDirectEntries(ImageEntry entry) {
+    final alreadyExists = _images.containsKey(entry.url);
+    _images[entry.url] = entry;
+    return alreadyExists;
+  }
+
   Future> addEntryImages(FriendicaEntry entry) async {
     final imageEntries = [];
     for (final imageUrl in entry.images) {