mirror of
https://gitlab.com/mysocialportal/fediverse-archiving-tools.git
synced 2024-10-18 08:53:31 +00:00
Add image importing and looping over pages
This commit is contained in:
parent
0302fa2cd0
commit
1cd010ea02
8 changed files with 281 additions and 67 deletions
|
@ -7,5 +7,6 @@ class ExecError {
|
|||
|
||||
/// Categories of failure that can be attached to an error value.
enum ErrorType {
|
||||
/// NOTE(review): presumably "credentials were rejected" - confirm with callers.
authentication,
|
||||
/// Failure raised on the client side, e.g. an exception caught while
/// issuing a request.
localError,
|
||||
/// NOTE(review): presumably "requested API path does not exist" - confirm.
missingEndpoint,
|
||||
}
|
||||
|
|
15
bin/extensions.dart
Normal file
15
bin/extensions.dart
Normal file
|
@ -0,0 +1,15 @@
|
|||
/// Adds element-wise comparison to [List].
extension ListEqualityTest<T> on List<T> {
  /// Whether this list and [list2] have the same length and hold `==`-equal
  /// elements at every position.
  bool equals(List<T> list2) {
    // Lists of different sizes can never match; bail out before indexing.
    if (length != list2.length) return false;

    // Walk the shared index range and stop at the first mismatch.
    return Iterable<int>.generate(length)
        .every((index) => !(this[index] != list2[index]));
  }
}
|
|
@ -1,48 +1,20 @@
|
|||
import 'dart:io';
|
||||
|
||||
import 'package:args/args.dart';
|
||||
import 'package:path/path.dart' as p;
|
||||
|
||||
import 'friendica_client.dart';
|
||||
import 'image_archiver.dart';
|
||||
import 'json_printer.dart';
|
||||
import 'models.dart';
|
||||
|
||||
const defaultRequestDelayMilliseconds = 5000;
|
||||
const defaultMaxPosts = 1;
|
||||
const defaultMaxPostsQuery = 10;
|
||||
const defaultReadComments = false;
|
||||
const defaultReadImages = false;
|
||||
|
||||
void main(List<String> arguments) async {
|
||||
final argParser = ArgParser()
|
||||
..addOption('archive-folder',
|
||||
abbr: 'a',
|
||||
help:
|
||||
'Specifies the local folder all data files pulled from the server will be stored',
|
||||
mandatory: true)
|
||||
..addOption('username',
|
||||
abbr: 'u', help: 'Username on your Friendica instance', mandatory: true)
|
||||
..addOption('server-name',
|
||||
abbr: 's',
|
||||
help:
|
||||
'The server name for your instance. (e.g. if the URL in your browser is "https://friendica.com/" then this would be "friendica.com',
|
||||
mandatory: true)
|
||||
..addOption('delay',
|
||||
abbr: 'd',
|
||||
help:
|
||||
'Delay in milliseconds between requests to try not to stress the server (thousands of API calls can be made)',
|
||||
defaultsTo: '$defaultRequestDelayMilliseconds')
|
||||
..addOption('max-post-requests',
|
||||
abbr: 'm',
|
||||
help: 'The maximum number of times to query for posts',
|
||||
defaultsTo: '$defaultMaxPosts')
|
||||
..addFlag('read-comments',
|
||||
abbr: 'c',
|
||||
help:
|
||||
'Whether to read comments on posts (defaults to $defaultReadComments)',
|
||||
defaultsTo: defaultReadComments)
|
||||
..addFlag('download-images',
|
||||
abbr: 'i',
|
||||
help:
|
||||
'Whether to download images from posts when those images are stored on the server (not links to other sites) (defaults to $defaultReadImages)',
|
||||
defaultsTo: defaultReadComments);
|
||||
final argParser = _buildArgs();
|
||||
|
||||
late ArgResults settings;
|
||||
try {
|
||||
|
@ -53,10 +25,7 @@ void main(List<String> arguments) async {
|
|||
return;
|
||||
}
|
||||
|
||||
stdout.write('Enter Password: ');
|
||||
_setEcho(false);
|
||||
final password = stdin.readLineSync() ?? '';
|
||||
_setEcho(true);
|
||||
print('');
|
||||
|
||||
final username = settings['username'];
|
||||
|
@ -64,25 +33,81 @@ void main(List<String> arguments) async {
|
|||
username: username,
|
||||
password: password,
|
||||
serverName: settings['server-name']);
|
||||
final timelineResult = await client.getTimeline(username, 1, 20);
|
||||
timelineResult.match(
|
||||
onSuccess: (posts) {
|
||||
posts.forEach(print);
|
||||
File('/tmp/test.json').writeAsStringSync(PrettyJsonEncoder()
|
||||
.convert(posts.map((p) => p.originalJson).toList()));
|
||||
},
|
||||
onError: (error) => print('Error getting posts: $error'));
|
||||
print("Done processing API requests");
|
||||
final baseDirectory = Directory(settings['archive-folder']);
|
||||
final imageArchive = ImageArchiver(client, baseDirectory);
|
||||
final maxQueries = int.parse(settings['max-post-requests']);
|
||||
final queryDelayMillis = int.parse(settings['delay']);
|
||||
final sleepDuration = Duration(milliseconds: queryDelayMillis);
|
||||
final itemsPerPage = 20;
|
||||
final allEntries = <FriendicaEntry>[];
|
||||
|
||||
for (var page = 0; page < maxQueries; page++) {
|
||||
final timelineResult =
|
||||
await client.getTimeline(username, page, itemsPerPage);
|
||||
if (timelineResult.isFailure) {
|
||||
print('Error getting entries: ${timelineResult.error}');
|
||||
continue;
|
||||
}
|
||||
final entries = timelineResult.value;
|
||||
print('# Post/Comments returned for Page $page: ${entries.length}');
|
||||
allEntries.addAll(entries);
|
||||
for (final entry in entries) {
|
||||
final imageEntryResults = await imageArchive.addEntryImages(entry);
|
||||
if (entry.images.isNotEmpty) {
|
||||
print(
|
||||
'${imageEntryResults.length} new images of ${entry.images.length} in entry retrieved');
|
||||
}
|
||||
}
|
||||
if (entries.length != itemsPerPage) {
|
||||
print(
|
||||
'Returned less than a full page, assuming at end of timeline and quiting');
|
||||
break;
|
||||
}
|
||||
print("Sleeping for $queryDelayMillis milliseconds before next query");
|
||||
final postsJsonFile = p.join(baseDirectory.path, 'postsAndComments.json');
|
||||
final postsJson = allEntries.map((e) => e.originalJson).toList();
|
||||
File(postsJsonFile)
|
||||
.writeAsStringSync(PrettyJsonEncoder().convert(postsJson));
|
||||
print("Posts written to JSON file: $postsJsonFile");
|
||||
final imageArchiveJsonFilePath = p.join(baseDirectory.path, 'images.json');
|
||||
File(imageArchiveJsonFilePath)
|
||||
.writeAsStringSync(PrettyJsonEncoder().convert(imageArchive.images));
|
||||
print('Images directory saved to: $imageArchiveJsonFilePath');
|
||||
sleep(sleepDuration);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/// Switches terminal echo of typed input on or off.
///
/// Best effort: setting echo is known to fail in IntelliJ and in release
/// builds, so a failure is reported and the current echo mode is left as is.
void _setEcho(bool value) {
  try {
    stdin.echoMode = value;
  } on Exception catch (e) {
    // Only recoverable exceptions (e.g. stdin is not a terminal) are
    // tolerated; programming errors still surface.
    print('');
    print('Error toggling echo to $value, so will stay current value...');
    print('Cause: $e');
  }
}
|
||||
/// Builds the command-line parser for the archiver.
///
/// Mandatory options: `archive-folder`, `username`, `server-name`.
/// Optional: `delay` (milliseconds between requests), `max-post-requests`,
/// and the `read-comments` / `download-images` flags.
ArgParser _buildArgs() => ArgParser()
  ..addOption('archive-folder',
      abbr: 'a',
      help:
          'Specifies the local folder all data files pulled from the server will be stored',
      mandatory: true)
  ..addOption('username',
      abbr: 'u', help: 'Username on your Friendica instance', mandatory: true)
  ..addOption('server-name',
      abbr: 's',
      help:
          'The server name for your instance. (e.g. if the URL in your browser is "https://friendica.com/" then this would be "friendica.com")',
      mandatory: true)
  ..addOption('delay',
      abbr: 'd',
      help:
          'Delay in milliseconds between requests to try not to stress the server (thousands of API calls can be made)',
      defaultsTo: '$defaultRequestDelayMilliseconds')
  ..addOption('max-post-requests',
      abbr: 'm',
      help: 'The maximum number of times to query for posts',
      defaultsTo: '$defaultMaxPostsQuery')
  ..addFlag('read-comments',
      abbr: 'c',
      help:
          'Whether to read comments on posts (defaults to $defaultReadComments)',
      defaultsTo: defaultReadComments)
  ..addFlag('download-images',
      abbr: 'i',
      help:
          'Whether to download images from posts when those images are stored on the server (not links to other sites) (defaults to $defaultReadImages)',
      // The image flag follows its own default, not the comments default.
      defaultsTo: defaultReadImages);
|
||||
|
|
|
@ -3,13 +3,13 @@ import 'dart:io';
|
|||
|
||||
import 'package:result_monad/result_monad.dart';
|
||||
|
||||
import 'exec_error.dart';
|
||||
import 'models.dart';
|
||||
|
||||
class FriendicaClient {
|
||||
final String username;
|
||||
final String password;
|
||||
final String serverName;
|
||||
final _client = HttpClient();
|
||||
late final String _authHeader;
|
||||
|
||||
FriendicaClient(
|
||||
|
@ -21,23 +21,34 @@ class FriendicaClient {
|
|||
_authHeader = "Basic $encodedAuthString";
|
||||
}
|
||||
|
||||
/// Fetches one page of the timeline of [userId].
///
/// [page] and [count] are passed straight through as the `page` and `count`
/// query parameters. On success the decoded JSON items are converted with
/// [FriendicaEntry.fromJson]; on failure the [ExecError] from the request is
/// returned unchanged.
FutureResult<List<FriendicaEntry>, ExecError> getTimeline(
    String userId, int page, int count) async {
  // Encode the caller-supplied id so characters such as '&' or spaces cannot
  // corrupt the query string.
  final encodedUserId = Uri.encodeQueryComponent(userId);
  final request = Uri.parse('https://$serverName/api/statuses/user_timeline'
      '?user_id=$encodedUserId&count=$count&page=$page');
  final response = await _getApiRequest(request);
  return response.mapValue((postsJson) => postsJson
      .map((postJson) => FriendicaEntry.fromJson(postJson))
      .toList());
}
|
||||
|
||||
FutureResult<List<dynamic>, String> _getApiRequest(Uri url) async {
|
||||
/// Issues a GET request for [url] carrying this client's authorization
/// header and returns the raw response.
///
/// The caller must consume or drain the returned response stream. Any
/// exception raised while connecting or sending is converted into an
/// [ExecError] of type [ErrorType.localError]. The HTTP status code is not
/// inspected here.
///
/// NOTE(review): the authorization header is sent to whatever host [url]
/// names - confirm callers only pass URLs on the configured server.
FutureResult<HttpClientResponse, ExecError> getUrl(Uri url) async {
  try {
    // Reuse the shared client instead of creating (and never closing) a new
    // HttpClient for every request.
    final request = await _client.getUrl(url);
    request.headers.add(HttpHeaders.authorizationHeader, _authHeader);
    final response = await request.close();
    return Result.ok(response);
  } catch (e) {
    return Result.error(
        ExecError(type: ErrorType.localError, message: e.toString()));
  }
}
|
||||
|
||||
/// Performs an authenticated GET against [url] and decodes the body as a
/// JSON array.
///
/// Failure modes are reported as [ExecError] values instead of exceptions:
/// * transport failures from [getUrl] are passed through unchanged;
/// * HTTP 401/403 become [ErrorType.authentication];
/// * HTTP 404 becomes [ErrorType.missingEndpoint];
/// * any other non-200 status, an unreadable body, invalid or empty JSON, or
///   JSON that is not an array become [ErrorType.localError].
FutureResult<List<dynamic>, ExecError> _getApiRequest(Uri url) async {
  final responseResult = await getUrl(url);
  if (responseResult.isFailure) {
    return Result.error(responseResult.error);
  }

  final response = responseResult.value;
  try {
    // Always read the body, even on error statuses, so the connection is
    // released back to the client.
    final body = await response.transform(utf8.decoder).join('');
    final status = response.statusCode;

    if (status == HttpStatus.unauthorized || status == HttpStatus.forbidden) {
      return Result.error(ExecError(
          type: ErrorType.authentication,
          message: 'Server rejected the credentials for $url (HTTP $status)'));
    }
    if (status == HttpStatus.notFound) {
      return Result.error(ExecError(
          type: ErrorType.missingEndpoint,
          message: 'Endpoint not found: $url (HTTP $status)'));
    }
    if (status != HttpStatus.ok) {
      return Result.error(ExecError(
          type: ErrorType.localError,
          message: 'Unexpected HTTP $status from $url'));
    }

    final decoded = jsonDecode(body);
    if (decoded is List<dynamic>) {
      return Result.ok(decoded);
    }
    return Result.error(ExecError(
        type: ErrorType.localError,
        message: 'Expected a JSON array from $url but got ${decoded.runtimeType}'));
  } on FormatException catch (e) {
    // Covers malformed UTF-8 as well as invalid or empty JSON.
    return Result.error(ExecError(
        type: ErrorType.localError,
        message: 'Could not decode response from $url: ${e.message}'));
  } on IOException catch (e) {
    // Connection dropped while the body was being read.
    return Result.error(
        ExecError(type: ErrorType.localError, message: e.toString()));
  }
}
|
||||
|
|
97
bin/image_archiver.dart
Normal file
97
bin/image_archiver.dart
Normal file
|
@ -0,0 +1,97 @@
|
|||
import 'dart:convert';
|
||||
import 'dart:io';
|
||||
|
||||
import 'package:path/path.dart' as p;
|
||||
import 'package:uuid/uuid.dart';
|
||||
|
||||
import 'extensions.dart';
|
||||
import 'friendica_client.dart';
|
||||
import 'models.dart';
|
||||
|
||||
/// Downloads the images referenced by timeline entries into an `images`
/// folder and remembers which URL was stored under which local file name.
class ImageArchiver {
  /// Client used to request the image URLs.
  final FriendicaClient client;

  /// Archived images keyed by source URL, so each URL is fetched only once.
  final _images = <String, ImageEntry>{};

  /// Folder the image files are written to (`<baseDirectory>/images`).
  final Directory imageDirectory;

  /// Read-only snapshot of every image archived so far.
  List<ImageEntry> get images => List.unmodifiable(_images.values);

  /// Creates the archiver and makes sure `<baseDirectory>/images` exists.
  ImageArchiver(this.client, Directory baseDirectory)
      : imageDirectory = Directory(p.join(baseDirectory.path, 'images')) {
    imageDirectory.createSync(recursive: true);
  }

  /// Downloads every image of [entry] that has not been archived yet.
  ///
  /// Returns only the entries newly stored by this call. URLs that cannot be
  /// parsed, requests that fail, non-200 responses and I/O errors while
  /// reading or writing are reported via `print` and skipped, so one bad
  /// image does not abort the rest.
  Future<List<ImageEntry>> addEntryImages(FriendicaEntry entry) async {
    final imageEntries = <ImageEntry>[];
    for (final imageUrl in entry.images) {
      if (_images.containsKey(imageUrl)) {
        continue;
      }

      final url = Uri.tryParse(imageUrl);
      if (url == null) {
        print('Skipping image with unparseable URL: $imageUrl');
        continue;
      }

      final imageResponse = await client.getUrl(url);
      if (imageResponse.isFailure) {
        print(imageResponse.error);
        continue;
      }

      final response = imageResponse.value;
      try {
        if (response.statusCode != HttpStatus.ok) {
          print(
              'Error response attempting to retrieve image $imageUrl: ${response.statusCode}');
          // Discard the unread body so the connection is released.
          await response.drain<void>();
          continue;
        }

        final buffer = BytesBuilder(copy: false);
        await for (final chunk in response) {
          buffer.add(chunk);
        }
        final contents = buffer.takeBytes();

        final extension = calculateExtensions(contents);
        final filename = Uuid().v4().replaceAll('-', '') + extension;
        final filePath = p.join(imageDirectory.path, filename);
        await File(filePath).writeAsBytes(contents);

        final newEntry = ImageEntry(
            postId: entry.id.toString(),
            localFilename: filename,
            url: imageUrl);
        _images[imageUrl] = newEntry;
        imageEntries.add(newEntry);
      } on IOException catch (e) {
        print('Error downloading or storing image $imageUrl: $e');
      }
    }

    return imageEntries;
  }

  /// Guesses a file extension (including the leading dot) from the leading
  /// magic bytes of [imageBytes].
  ///
  /// Recognizes JPEG, PNG, GIF, TIFF and BMP. Returns an empty string when
  /// the data is too short or matches none of the known signatures.
  String calculateExtensions(List<int> imageBytes) {
    final bmp = ascii.encode("BM").toList(); // BMP
    final gif = ascii.encode("GIF").toList(); // GIF
    const png = <int>[137, 80, 78, 71]; // PNG
    const tiff = <int>[73, 73, 42]; // TIFF, little-endian ("II*")
    const tiffBigEndian = <int>[77, 77, 0, 42]; // TIFF, big-endian ("MM\0*")
    const tiffLegacy = <int>[77, 77, 42]; // kept from the original table
    // Every JPEG starts with the SOI marker FF D8 followed by another FF
    // marker byte; the fourth byte varies (E0 JFIF, E1 Exif, DB, EE, ...).
    const jpeg = <int>[255, 216, 255];

    // Length is checked first so short or empty bodies never hit sublist()
    // with an out-of-range end index.
    bool startsWith(List<int> signature) =>
        imageBytes.length >= signature.length &&
        imageBytes.sublist(0, signature.length).equals(signature);

    if (startsWith(jpeg)) {
      return '.jpg';
    }

    if (startsWith(png)) {
      return '.png';
    }

    if (startsWith(gif)) {
      return '.gif';
    }

    if (startsWith(tiff) ||
        startsWith(tiffBigEndian) ||
        startsWith(tiffLegacy)) {
      return '.tif';
    }

    if (startsWith(bmp)) {
      return '.bmp';
    }

    return '';
  }
}
|
|
@ -46,3 +46,23 @@ class FriendicaEntry {
|
|||
.toList();
|
||||
}
|
||||
}
|
||||
|
||||
/// Links one archived image file to the post and URL it came from.
class ImageEntry {
  /// Identifier of the post that referenced the image.
  final String postId;

  /// Name of the file the image is stored under locally.
  final String localFilename;

  /// Address the image was requested from.
  final String url;

  /// Creates an entry; all three values are required.
  ImageEntry({
    required this.postId,
    required this.localFilename,
    required this.url,
  });

  /// Rebuilds an entry from a decoded JSON map; keys that are absent or null
  /// become empty strings.
  ImageEntry.fromJson(Map<String, dynamic> json)
      : postId = json['postId'] ?? '',
        localFilename = json['localFilename'] ?? '',
        url = json['url'] ?? '';

  /// Converts this entry into a JSON-encodable map using the same keys that
  /// [ImageEntry.fromJson] reads.
  Map<String, dynamic> toJson() {
    final encoded = <String, dynamic>{};
    encoded['postId'] = postId;
    encoded['localFilename'] = localFilename;
    encoded['url'] = url;
    return encoded;
  }
}
|
||||
|
|
42
pubspec.lock
42
pubspec.lock
|
@ -8,6 +8,20 @@ packages:
|
|||
url: "https://pub.dartlang.org"
|
||||
source: hosted
|
||||
version: "2.3.0"
|
||||
collection:
|
||||
dependency: transitive
|
||||
description:
|
||||
name: collection
|
||||
url: "https://pub.dartlang.org"
|
||||
source: hosted
|
||||
version: "1.15.0"
|
||||
crypto:
|
||||
dependency: transitive
|
||||
description:
|
||||
name: crypto
|
||||
url: "https://pub.dartlang.org"
|
||||
source: hosted
|
||||
version: "3.0.1"
|
||||
lints:
|
||||
dependency: "direct dev"
|
||||
description:
|
||||
|
@ -15,6 +29,20 @@ packages:
|
|||
url: "https://pub.dartlang.org"
|
||||
source: hosted
|
||||
version: "1.0.1"
|
||||
logging:
|
||||
dependency: "direct main"
|
||||
description:
|
||||
name: logging
|
||||
url: "https://pub.dartlang.org"
|
||||
source: hosted
|
||||
version: "1.0.2"
|
||||
path:
|
||||
dependency: "direct main"
|
||||
description:
|
||||
name: path
|
||||
url: "https://pub.dartlang.org"
|
||||
source: hosted
|
||||
version: "1.8.1"
|
||||
result_monad:
|
||||
dependency: "direct main"
|
||||
description:
|
||||
|
@ -22,5 +50,19 @@ packages:
|
|||
url: "https://pub.dartlang.org"
|
||||
source: hosted
|
||||
version: "1.0.2"
|
||||
typed_data:
|
||||
dependency: transitive
|
||||
description:
|
||||
name: typed_data
|
||||
url: "https://pub.dartlang.org"
|
||||
source: hosted
|
||||
version: "1.3.0"
|
||||
uuid:
|
||||
dependency: "direct main"
|
||||
description:
|
||||
name: uuid
|
||||
url: "https://pub.dartlang.org"
|
||||
source: hosted
|
||||
version: "3.0.5"
|
||||
sdks:
|
||||
dart: ">=2.15.1 <3.0.0"
|
||||
|
|
|
@ -8,7 +8,10 @@ environment:
|
|||
|
||||
dependencies:
|
||||
args: ^2.3.0
|
||||
logging: ^1.0.2
|
||||
path: ^1.8.1
|
||||
result_monad: ^1.0.2
|
||||
uuid: ^3.0.5
|
||||
|
||||
dev_dependencies:
|
||||
lints: ^1.0.0
|
||||
|
|
Loading…
Reference in a new issue