From c7a0f7b969e31f3ac7aac48bc5fc60d0de6c9f7d Mon Sep 17 00:00:00 2001 From: Sarah Zakarias Date: Tue, 3 Sep 2024 14:27:05 +0200 Subject: [PATCH] Download counts: handle several data files (#8011) --- .../download_counts/sync_download_counts.dart | 263 +++++++++-------- .../download_counts/download_counts_test.dart | 279 ++++++++++++++++-- .../fake_download_counts_data1.jsonl | 1 + 3 files changed, 398 insertions(+), 145 deletions(-) create mode 100644 app/test/service/download_counts/fake_download_counts_data1.jsonl diff --git a/app/lib/service/download_counts/sync_download_counts.dart b/app/lib/service/download_counts/sync_download_counts.dart index 7d5aa7043..93bba7e41 100644 --- a/app/lib/service/download_counts/sync_download_counts.dart +++ b/app/lib/service/download_counts/sync_download_counts.dart @@ -56,120 +56,152 @@ Map _extractDayCounts(Map json) { return dayCounts; } -Future processDownloadCounts( - String downloadCountsFileName, DateTime date) async { - final downloadCountsBucket = +Future> processDownloadCounts(DateTime date) async { + final fileNamePrefix = + ['daily_download_counts', formatDateForFileName(date), 'data-'].join('/'); + final bucket = storageService.bucket(activeConfiguration.downloadCountsBucketName!); - Uint8List bytes; - try { - bytes = await downloadCountsBucket.readAsBytes(downloadCountsFileName); - } on Exception catch (e) { - _logger.info('Failed to read "$downloadCountsFileName"./n' - '$e'); - return false; - } - if (bytes.isEmpty) { - _logger.severe('$downloadCountsFileName is empty.'); - return false; - } - bool hasFailedLines = false; - // Before '2024-05-03' the query generating the download count data had a bug - // in the reg exp causing incorrect versions to be stored. - final regExpQueryFixDate = DateTime.parse('2024-05-03'); - final processedPackages = {}; - var lines = []; - - try { - lines = utf8.decode(bytes).split('\n'); - } on FormatException catch (e) { - _logger.severe('Failed to utf8 decode bytes of $downloadCountsFileName/n' - '$e'); - return false; + final failedFiles = {}; + + final bucketEntries = await bucket.list(prefix: fileNamePrefix).toList(); + + if (bucketEntries.isEmpty) { + _logger.info('Failed to read any files with prefix "$fileNamePrefix"./n'); + failedFiles.add(fileNamePrefix); } + final processedPackages = {}; final pool = Pool(10); - await Future.wait(lines.map((line) async { - return await pool.withResource(() async { - if (line.isBlank) { - return; - } - final String package; - final Map dayCounts; - try { - final data = json.decode(line); - if (data is! Map) { - throw FormatException('Download counts data is not valid json'); - } + for (final f in bucketEntries) { + final fileName = f.name; + if (f.isDirectory) { + _logger.severe( + 'Cannot process $f, since $f is a directory. Expected a "jsonl" file.'); + failedFiles.add(fileName); + continue; + } + if (!f.name.endsWith('jsonl')) { + _logger.severe('Cannot process $f. Expected a "jsonl" file.'); + failedFiles.add(fileName); + continue; + } + + Uint8List bytes; + try { + bytes = await bucket.readAsBytes(fileName); + } on Exception catch (e) { + _logger.info('Failed to read "$fileName"./n' + '$e'); + failedFiles.add(fileName); + continue; + } + + if (bytes.isEmpty) { + _logger.severe('$fileName is empty.'); + failedFiles.add(fileName); + continue; + } + // Before '2024-05-03' the query generating the download count data had a bug + // in the reg exp causing incorrect versions to be stored. + final regExpQueryFixDate = DateTime.parse('2024-05-03'); + List lines; + + try { + lines = utf8.decode(bytes).split('\n'); + } on FormatException catch (e) { + _logger.severe('Failed to utf8 decode bytes of $fileName/n' + '$e'); + failedFiles.add(fileName); + continue; + } - if (data['package'] is! String) { - throw FormatException('"package" must be a String'); + await Future.wait(lines.map((line) async { + return await pool.withResource(() async { + if (line.isBlank) { + return; } - package = data['package'] as String; - processedPackages.add(package); - - dayCounts = _extractDayCounts(data); - } on FormatException catch (e) { - _logger.severe( - 'Failed to proccess line $line of file $downloadCountsFileName \n' - '$e'); - hasFailedLines = true; - return; - } - - List versions; - try { - // Validate that the package and version exist and ignore the - // non-existing packages and versions. - // First do it via the cached data, fall back to query for invisible - // and moderated packages. - versions = (await packageBackend.listVersionsCached(package)) - .versions - .map((e) => e.version) - .toList(); - - final nonExistingVersions = []; - dayCounts.keys.forEach((version) { - if (!versions.contains(version)) { - nonExistingVersions.add(version); - if (date.isBefore(regExpQueryFixDate)) { - // If the data is generated before the fix of the query, we - // ignore versions that do not exist. - _logger.warning( - '$package-$version appeared in download counts data but does' - ' not exist'); - } else { - _logger.severe( - '$package-$version appeared in download counts data but does' - ' not exist'); - } + final String package; + final Map dayCounts; + try { + final data = json.decode(line); + if (data is! Map) { + throw FormatException('Download counts data is not valid json'); } - }); - - nonExistingVersions.forEach((v) => dayCounts.remove(v)); - } on NotFoundException catch (e) { - final pkg = await packageBackend.lookupPackage(package); - // The package is neither invisible or tombstoned, hence there is - // probably an error in the generated data. - if (pkg == null && - (await packageBackend.lookupModeratedPackage(package)) == null) { - _logger.severe( - 'Package $package appeared in download counts data for file ' - '$downloadCountsFileName but does not exist.\n' - 'Error: $e'); + + if (data['package'] is! String) { + throw FormatException('"package" must be a String'); + } + package = data['package'] as String; + processedPackages.add(package); + + dayCounts = _extractDayCounts(data); + } on FormatException catch (e) { + _logger.severe('Failed to proccess line $line of file $fileName \n' + '$e'); + failedFiles.add(fileName); return; - } // else { - // The package is either invisible, tombstoned or has no versions. - // } - } - - await downloadCountsBackend.updateDownloadCounts( - package, - dayCounts, - date, - ); - }); - })); + } + + List versions; + try { + // Validate that the package and version exist and ignore the + // non-existing packages and versions. + // First do it via the cached data, fall back to query for invisible + // and moderated packages. + versions = (await packageBackend.listVersionsCached(package)) + .versions + .map((e) => e.version) + .toList(); + + final nonExistingVersions = []; + dayCounts.keys.forEach((version) { + if (!versions.contains(version)) { + nonExistingVersions.add(version); + if (date.isBefore(regExpQueryFixDate)) { + // If the data is generated before the fix of the query, we + // ignore versions that do not exist. + _logger.warning( + '$package-$version appeared in download counts data but does' + ' not exist'); + } else { + _logger.severe( + '$package-$version appeared in download counts data but does' + ' not exist'); + } + } + }); + + nonExistingVersions.forEach((v) => dayCounts.remove(v)); + } on NotFoundException catch (e) { + final pkg = await packageBackend.lookupPackage(package); + // The package is neither invisible or tombstoned, hence there is + // probably an error in the generated data. + if (pkg == null && + (await packageBackend.lookupModeratedPackage(package)) == null) { + _logger.severe( + 'Package $package appeared in download counts data for file ' + '$fileName but does not exist.\n' + 'Error: $e'); + return; + } // else { + // The package is either invisible, tombstoned or has no versions. + // } + } + await downloadCountsBackend.updateDownloadCounts( + package, + dayCounts, + date, + ); + }); + })); + } + + if (processedPackages.isEmpty) { + // We didn't successfully process any package. Either the bucket had no + // files or all files were invalid, hence we return early. + return failedFiles; + } // Record zero downloads for this date for packages not mentioned in the // query output. @@ -182,7 +214,8 @@ Future processDownloadCounts( await downloadCountsBackend.updateDownloadCounts(package, {}, date); }); })); - return !hasFailedLines; + + return failedFiles; } const defaultNumberOfSyncDays = 5; @@ -212,29 +245,19 @@ Future syncDownloadCounts( for (int i = 0; i < numberOfSyncDays; i++) { final syncDate = startingDate.addCalendarDays(-i); - // TODO(zarah): Handle the case where there is more than one file per day. - final fileName = [ - 'daily_download_counts', - formatDateForFileName(syncDate), - 'data-000000000000.jsonl', - ].join('/'); - final success = await processDownloadCounts(fileName, syncDate); - if (!success) { - failedFiles.add(fileName); - } + failedFiles.addAll(await processDownloadCounts(syncDate)); } - final yesterdayFileName = [ + final yesterdayFileNamePrefix = [ 'daily_download_counts', formatDateForFileName(yesterday), - 'data-000000000000.jsonl', ].join('/'); if (failedFiles.isNotEmpty) { _logger .shout('Download counts sync was partial. The following files failed:\n' '$failedFiles'); - - if (failedFiles.length > 1 || failedFiles.first != yesterdayFileName) { + if (!(failedFiles + .every((fileName) => fileName.contains(yesterdayFileNamePrefix)))) { // We only disregard failure of yesterday's file. Otherwise we consider // the sync to be broken. throw Exception( diff --git a/app/test/service/download_counts/download_counts_test.dart b/app/test/service/download_counts/download_counts_test.dart index 2ec8bef77..966f646ce 100644 --- a/app/test/service/download_counts/download_counts_test.dart +++ b/app/test/service/download_counts/download_counts_test.dart @@ -55,7 +55,7 @@ void main() { downloadCountsJsonFileName, path.join(Directory.current.path, 'test', 'service', 'download_counts', 'fake_download_counts_data.jsonl')); - await processDownloadCounts(downloadCountsJsonFileName, date); + await processDownloadCounts(date); final countData = await downloadCountsBackend.lookupDownloadCountData('neon'); @@ -76,7 +76,7 @@ void main() { downloadCountsJsonFileNameJan5, path.join(Directory.current.path, 'test', 'service', 'download_counts', 'fake_download_counts_data.jsonl')); - await processDownloadCounts(downloadCountsJsonFileNameJan5, date); + await processDownloadCounts(date); final nextDate = DateTime.parse('2024-01-06'); final downloadCountsJsonFileNameJan6 = @@ -89,7 +89,7 @@ void main() { 'service', 'download_counts', 'fake_download_counts_data_less_packages.jsonl')); - await processDownloadCounts(downloadCountsJsonFileNameJan6, nextDate); + await processDownloadCounts(nextDate); final countData = await downloadCountsBackend.lookupDownloadCountData('neon'); @@ -121,18 +121,17 @@ void main() { 'service', 'download_counts', 'fake_download_counts_data_faulty_line.jsonl')); - bool succeeded; + Set failedFiles; final messages = []; final subscription = Logger.root.onRecord.listen((event) { messages.add(event.message); }); try { - succeeded = await processDownloadCounts( - downloadCountsJsonFileNameJan6, nextDate); + failedFiles = await processDownloadCounts(nextDate); } finally { await subscription.cancel(); } - expect(succeeded, false); + expect(failedFiles, isNotEmpty); expect(messages.first, contains('Failed to proccess line')); expect(messages.first, contains('FormatException: "count" must be a String.')); @@ -159,18 +158,17 @@ void main() { 'service', 'download_counts', 'fake_download_counts_data_non_existing_package.jsonl')); - bool succeeded; + Set failedFiles; final messages = []; final subscription = Logger.root.onRecord.listen((event) { messages.add(event.message); }); try { - succeeded = await processDownloadCounts( - downloadCountsJsonFileNameJan6, nextDate); + failedFiles = await processDownloadCounts(nextDate); } finally { await subscription.cancel(); } - expect(succeeded, true); + expect(failedFiles, isEmpty); expect(messages.first, contains('Could not find `package "hest"`.')); // We still process the lines that are possible final countData = @@ -185,21 +183,19 @@ void main() { testWithProfile('file not present', fn: () async { final nextDate = DateTime.parse('2024-01-06'); - bool succeeded; + Set failedFiles; final messages = []; final subscription = Logger.root.onRecord.listen((event) { messages.add(event.message); }); try { - succeeded = - await processDownloadCounts('this_file_does_not_exist', nextDate); + failedFiles = await processDownloadCounts(nextDate); } finally { await subscription.cancel(); } - expect(succeeded, false); - expect(messages.first, - contains('Failed to read "this_file_does_not_exist".')); + expect(failedFiles, isNotEmpty); + expect(messages.first, contains('Failed to read')); }); testWithProfile('empty file', fn: () async { @@ -210,19 +206,18 @@ void main() { downloadCountsJsonFileNameJan6, path.join(Directory.current.path, 'test', 'service', 'download_counts', 'fake_download_counts_data_empty.jsonl')); - bool succeeded; + Set failedFiles; final messages = []; final subscription = Logger.root.onRecord.listen((event) { messages.add(event.message); }); try { - succeeded = await processDownloadCounts( - downloadCountsJsonFileNameJan6, nextDate); + failedFiles = await processDownloadCounts(nextDate); } finally { await subscription.cancel(); } - expect(succeeded, false); + expect(failedFiles, isNotEmpty); expect( messages, contains( @@ -338,16 +333,16 @@ void main() { ); expect( messages.first, - contains('Failed to read ' + contains('Failed to read any files with prefix ' '"daily_download_counts/' '${formatDateForFileName(yesterday)}' - '/data-000000000000.jsonl".')); + '/data-')); expect( messages, contains( 'Download counts sync was partial. The following files failed:\n' '[daily_download_counts/${formatDateForFileName(yesterday)}' - '/data-000000000000.jsonl]')); + '/data-]')); }); testWithProfile('Sync download counts - fail', fn: () async { @@ -367,19 +362,34 @@ void main() { 'download_counts', 'fake_download_counts_data.jsonl'), ); } + + final messages = []; + final subscription = Logger.root.onRecord.listen((event) { + messages.add(event.message); + }); + var exception = ''; try { await syncDownloadCounts(); } on Exception catch (e) { exception = e.toString(); + } finally { + await subscription.cancel(); } + expect( + messages.first, + contains('Failed to read any files with prefix ' + '"daily_download_counts/' + '${formatDateForFileName(skippedDate)}' + '/data-')); + expect( exception, 'Exception: Download counts sync was partial. The following files failed:' '[daily_download_counts/' '${formatDateForFileName(skippedDate)}' - '/data-000000000000.jsonl]'); + '/data-]'); final countData = await downloadCountsBackend.lookupDownloadCountData('neon'); @@ -394,5 +404,224 @@ void main() { [1, 1, 1, 1, 0, 0], ); }); + + testWithProfile('Sync download counts several data files - success', + fn: () async { + final today = clock.now(); + + for (int i = defaultNumberOfSyncDays; i > 0; i--) { + final date = today.addCalendarDays(-i); + final fileName = [ + 'daily_download_counts', + formatDateForFileName(date), + 'data-000000000000.jsonl', + ].join('/'); + + final fileName1 = [ + 'daily_download_counts', + formatDateForFileName(date), + 'data-000000000001.jsonl', + ].join('/'); + + await generateFakeDownloadCounts( + fileName, + path.join(Directory.current.path, 'test', 'service', + 'download_counts', 'fake_download_counts_data.jsonl'), + ); + + await generateFakeDownloadCounts( + fileName1, + path.join(Directory.current.path, 'test', 'service', + 'download_counts', 'fake_download_counts_data1.jsonl'), + ); + } + + await syncDownloadCounts(); + final countDataNeon = + await downloadCountsBackend.lookupDownloadCountData('neon'); + expect(countDataNeon, isNotNull); + expect(countDataNeon!.newestDate!.day, today.addCalendarDays(-1).day); + expect( + countDataNeon.totalCounts.take(6).toList(), + [1, 1, 1, 1, 1, -1], + ); + expect( + countDataNeon.majorRangeCounts.first.counts.take(6), + [1, 1, 1, 1, 1, 0], + ); + + final countDataFT = await downloadCountsBackend + .lookupDownloadCountData('flutter_titanium'); + expect(countDataFT, isNotNull); + expect(countDataFT!.newestDate!.day, today.addCalendarDays(-1).day); + expect( + countDataFT.totalCounts.take(6).toList(), + [1, 1, 1, 1, 1, -1], + ); + expect( + countDataFT.majorRangeCounts.first.counts.take(6), + [1, 1, 1, 1, 1, 0], + ); + }); + }); + + testWithProfile('Sync download counts several data files - success & failure', + fn: () async { + final today = clock.now(); + + for (int i = defaultNumberOfSyncDays; i > 0; i--) { + final date = today.addCalendarDays(-i); + final fileName = [ + 'daily_download_counts', + formatDateForFileName(date), + 'data-000000000000.jsonl', + ].join('/'); + + final fileName1 = [ + 'daily_download_counts', + formatDateForFileName(date), + 'data-000000000001.jsonl', + ].join('/'); + + await generateFakeDownloadCounts( + fileName, + path.join(Directory.current.path, 'test', 'service', 'download_counts', + 'fake_download_counts_data.jsonl'), + ); + + await generateFakeDownloadCounts( + fileName1, + path.join(Directory.current.path, 'test', 'service', 'download_counts', + 'fake_download_counts_data_empty.jsonl'), + ); + } + String exception = ''; + try { + await syncDownloadCounts(); + } on Exception catch (e) { + exception = e.toString(); + } + final countDataNeon = + await downloadCountsBackend.lookupDownloadCountData('neon'); + expect(countDataNeon, isNotNull); + expect(countDataNeon!.newestDate!.day, today.addCalendarDays(-1).day); + expect( + countDataNeon.totalCounts.take(6).toList(), + [1, 1, 1, 1, 1, -1], + ); + expect( + countDataNeon.majorRangeCounts.first.counts.take(6), + [1, 1, 1, 1, 1, 0], + ); + + expect( + exception, + contains( + 'Exception: Download counts sync was partial. The following files failed:' + '[daily_download_counts/' + '${formatDateForFileName(today.addCalendarDays(-1))}' + '/data-000000000001.jsonl')); + + final countDataFT = + await downloadCountsBackend.lookupDownloadCountData('flutter_titanium'); + expect(countDataFT, isNotNull); + expect(countDataFT!.newestDate!.day, today.addCalendarDays(-1).day); + expect( + countDataFT.totalCounts.take(6).toList(), + [0, 0, 0, 0, 0, -1], + ); + }); + testWithProfile('Sync download counts several data files - failure', + fn: () async { + final today = clock.now(); + + final goodDate = today.addCalendarDays(-2); + final fileName = [ + 'daily_download_counts', + formatDateForFileName(goodDate), + 'data-000000000000.jsonl', + ].join('/'); + + final fileName1 = [ + 'daily_download_counts', + formatDateForFileName(goodDate), + 'data-000000000001.jsonl', + ].join('/'); + + await generateFakeDownloadCounts( + fileName, + path.join(Directory.current.path, 'test', 'service', 'download_counts', + 'fake_download_counts_data.jsonl'), + ); + + await generateFakeDownloadCounts( + fileName1, + path.join(Directory.current.path, 'test', 'service', 'download_counts', + 'fake_download_counts_data1.jsonl'), + ); + final faultyDate = today.addCalendarDays(-1); + final fileName2 = [ + 'daily_download_counts', + formatDateForFileName(faultyDate), + 'data-000000000000.jsonl', + ].join('/'); + + final fileName3 = [ + 'daily_download_counts', + formatDateForFileName(faultyDate), + 'data-000000000001.jsonl', + ].join('/'); + + await generateFakeDownloadCounts( + fileName2, + path.join(Directory.current.path, 'test', 'service', 'download_counts', + 'fake_download_counts_data_empty.jsonl'), + ); + + await generateFakeDownloadCounts( + fileName3, + path.join(Directory.current.path, 'test', 'service', 'download_counts', + 'fake_download_counts_data_empty.jsonl'), + ); + + String exception = ''; + try { + await syncDownloadCounts(); + } on Exception catch (e) { + exception = e.toString(); + } + final countDataNeon = + await downloadCountsBackend.lookupDownloadCountData('neon'); + expect(countDataNeon, isNotNull); + expect(countDataNeon!.newestDate!.day, goodDate.day); + expect( + countDataNeon.totalCounts.take(6).toList(), + [1, -1, -1, -1, -1, -1], + ); + expect( + countDataNeon.majorRangeCounts.first.counts.take(6), + [1, 0, 0, 0, 0, 0], + ); + + expect( + exception, + contains( + 'Exception: Download counts sync was partial. The following files failed:' + '[daily_download_counts/' + '${formatDateForFileName(faultyDate)}' + '/data-000000000000.jsonl')); + + final countDataFT = + await downloadCountsBackend.lookupDownloadCountData('flutter_titanium'); + expect(countDataFT, isNotNull); + expect(countDataFT!.newestDate!.day, goodDate.day); + expect( + countDataFT.totalCounts.take(6).toList(), + [1, -1, -1, -1, -1, -1], + ); + expect( + countDataFT.majorRangeCounts.first.counts.take(6), + [1, 0, 0, 0, 0, 0], + ); }); } diff --git a/app/test/service/download_counts/fake_download_counts_data1.jsonl b/app/test/service/download_counts/fake_download_counts_data1.jsonl new file mode 100644 index 000000000..9da6fc805 --- /dev/null +++ b/app/test/service/download_counts/fake_download_counts_data1.jsonl @@ -0,0 +1 @@ +{"package":"flutter_titanium","total":"1","per_version":[{"version":"1.9.0","count":"1"}]}