diff --git a/COMMANDS.MD b/COMMANDS.MD index 0ecd4e3a..dc050be3 100644 --- a/COMMANDS.MD +++ b/COMMANDS.MD @@ -14,14 +14,15 @@ For an entire list of commands, you can run `php artisan list` - [Indexer](#indexer) - [Anime](#indexer-anime) - [Manga](#indexer-manga) - + - [Incremental](#indexer-incremental) + ## Commands ### Serve Command: `serve` Example: `php artisan serve` -Serve the application on the PHP development server +Serve the application on the PHP development server ### Queue @@ -98,7 +99,7 @@ This function only needs to be run once. Any entry's cache updating will automat Command: ``` -indexer:anime +indexer:manga {--failed : Run only entries that failed to index last time} {--resume : Resume from the last position} {--reverse : Start from the end of the array} @@ -109,3 +110,16 @@ indexer:anime Example: `indexer:manga` This simply translates to running the indexer without any additional configuration. + +#### Indexer: Incremental +Incrementally indexes media entries from MAL. +This command compares the latest version of MAL IDs from the [mal_id_cache](https://github.com/purarue/mal-id-cache) +GitHub repository with the IDs downloaded during the previous run. If no IDs are found from the previous run, a full indexing session is started.
+
+Command:
+```
+indexer:incremental {mediaType*}
+    {--failed : Run only entries that failed to index last time}
+    {--resume : Resume from the last position}
+    {--delay=3 : Set a delay between requests}
+```
diff --git a/app/Console/Commands/Indexer/AnimeIndexer.php b/app/Console/Commands/Indexer/AnimeIndexer.php
index 73f375d5..8704feac 100644
--- a/app/Console/Commands/Indexer/AnimeIndexer.php
+++ b/app/Console/Commands/Indexer/AnimeIndexer.php
@@ -2,7 +2,6 @@
 
 namespace App\Console\Commands\Indexer;
 
-use App\Exceptions\Console\CommandAlreadyRunningException;
 use App\Exceptions\Console\FileNotFoundException;
 use Illuminate\Console\Command;
 use Illuminate\Support\Facades\Storage;
diff --git a/app/Console/Commands/Indexer/IncrementalIndexer.php b/app/Console/Commands/Indexer/IncrementalIndexer.php
new file mode 100644
index 00000000..eb076ac0
--- /dev/null
+++ b/app/Console/Commands/Indexer/IncrementalIndexer.php
@@ -0,0 +1,228 @@
+            ['The media type to index.', 'Valid values: anime, manga']
+        ];
+    }
+
+    /**
+     * Load the id snapshot stored under indexer/incremental/<mediaType>.json.
+     *
+     * @param string $mediaType Media type whose snapshot should be read.
+     * @return array{0: string, 1: string} SHA-1 hash and raw contents of the snapshot;
+     *                                     both are empty strings when no snapshot file exists.
+     */
+    private function getExistingIds(string $mediaType): array
+    {
+        $snapshotPath = "indexer/incremental/$mediaType.json";
+
+        if (!Storage::exists($snapshotPath))
+        {
+            return ["", ""];
+        }
+
+        $existingIdsRaw = (string)Storage::get($snapshotPath);
+
+        return [sha1($existingIdsRaw), $existingIdsRaw];
+    }
+
+    /**
+     * Download the latest id list and work out which ids are not in the stored snapshot.
+     *
+     * The downloaded list is written to indexer/incremental/<mediaType>.json.tmp; fetchIds()
+     * promotes it to the real snapshot once indexing has finished.
+     *
+     * @param string $mediaType Media type to compare.
+     * @return array Ids to fetch keyed by "sfw" / "nsfw". Empty when the run was cancelled,
+     *               the download failed, or the list is unchanged since the stored snapshot.
+     */
+    private function getIdsToFetch(string $mediaType): array
+    {
+        [$existingIdsHash, $existingIdsRaw] = $this->getExistingIds($mediaType);
+
+        if ($this->cancelled)
+        {
+            return [];
+        }
+
+        // "{$var}" instead of "${var}": the latter form is deprecated as of PHP 8.2.
+        $newIdsRaw = file_get_contents("https://raw.githubusercontent.com/purarue/mal-id-cache/master/cache/{$mediaType}_cache.json");
+
+        /** @noinspection PhpConditionAlreadyCheckedInspection */
+        if ($this->cancelled)
+        {
+            return [];
+        }
+
+        if ($newIdsRaw === false)
+        {
+            $this->error("Failed to download the latest $mediaType id list.");
+            return [];
+        }
+
+        // identical hash => nothing changed since the last finalized run
+        if (sha1($newIdsRaw) === $existingIdsHash)
+        {
+            return [];
+        }
+
+        $newIds = json_decode($newIdsRaw, true);
+        if (!is_array($newIds))
+        {
+            $this->error("The downloaded $mediaType id list is not valid JSON.");
+            return [];
+        }
+
+        $existingIds = json_decode($existingIdsRaw, true);
+        $idsToFetch = [];
+
+        if (!is_array($existingIds) || count($existingIds) === 0)
+        {
+            // no usable snapshot from a previous run: index everything
+            $idsToFetch = $newIds;
+        }
+        else
+        {
+            foreach (["sfw", "nsfw"] as $t)
+            {
+                // ids present in the new list but missing from the previous snapshot
+                $idsToFetch[$t] = array_values(array_diff($newIds[$t] ?? [], $existingIds[$t] ?? []));
+            }
+        }
+
+        Storage::put("indexer/incremental/$mediaType.json.tmp", $newIdsRaw);
+
+        return $idsToFetch;
+    }
+
+    /**
+     * Read the ids recorded as failed in indexer/incremental/<mediaType>_failed.json.
+     *
+     * @param string $mediaType Media type whose failed ids should be read.
+     * @return array List of ids; empty when the file does not contain a JSON array.
+     */
+    private function getFailedIdsToFetch(string $mediaType): array
+    {
+        $failedIds = json_decode((string)Storage::get("indexer/incremental/{$mediaType}_failed.json"), true);
+
+        return is_array($failedIds) ? array_values($failedIds) : [];
+    }
+
+    /**
+     * Request every given id from the local API so that the entry gets indexed or updated.
+     *
+     * Progress is stored in indexer/incremental/<mediaType>_resume.save and failed ids in
+     * indexer/incremental/<mediaType>_failed.json (the file the --failed branch of handle() reads).
+     * When the loop completes, a pending <mediaType>.json.tmp snapshot is promoted to
+     * <mediaType>.json. A cancelled run returns early and leaves the resume file in place.
+     *
+     * @param string $mediaType  Media type being indexed; also used as the API path segment.
+     * @param array  $idsToFetch Ids keyed by "sfw" and/or "nsfw"; a missing key counts as empty.
+     * @param bool   $resume     Continue from the stored resume position when one exists.
+     * @param int    $delay      Seconds to wait between two requests; 0 disables the wait.
+     */
+    private function fetchIds(string $mediaType, array $idsToFetch, bool $resume, int $delay = 0): void
+    {
+        $resumePath = "indexer/incremental/{$mediaType}_resume.save";
+        $failedPath = "indexer/incremental/{$mediaType}_failed.json";
+        $pendingSnapshotPath = "indexer/incremental/$mediaType.json.tmp";
+
+        $index = 0;
+        $success = [];
+        $failedIds = [];
+
+        if ($resume && Storage::exists($resumePath))
+        {
+            $index = (int)Storage::get($resumePath);
+            $this->info("Resuming from index: $index");
+        }
+
+        // the --failed branch of handle() only provides the "sfw" key
+        $ids = array_merge($idsToFetch['sfw'] ?? [], $idsToFetch['nsfw'] ?? []);
+        // count the ids themselves, not the sfw/nsfw buckets
+        $idCount = count($ids);
+
+        if ($index > 0 && !isset($ids[$index]))
+        {
+            $index = 0;
+            $this->warn('Invalid index; set back to 0');
+        }
+
+        Storage::put($resumePath, (string)$index);
+
+        $this->info("$idCount $mediaType entries available");
+
+        for ($i = $index; $i < $idCount; $i++)
+        {
+            if ($this->cancelled)
+            {
+                return;
+            }
+
+            $id = $ids[$i];
+
+            $url = env('APP_URL') . "/v4/$mediaType/$id";
+            $this->info("Indexing/Updating " . ($i + 1) . "/$idCount $url [MAL ID: $id]");
+
+            $failed = false;
+
+            try
+            {
+                $body = file_get_contents($url);
+                $response = $body === false ? null : json_decode($body, true);
+
+                if ($body === false)
+                {
+                    $failed = true;
+                    $this->warn("[SKIPPED] Failed to fetch $url");
+                }
+                // an error payload with status 404 is not treated as a failure
+                elseif (isset($response['error']) && (int)($response['status'] ?? 0) !== 404)
+                {
+                    $failed = true;
+                    $this->error("[SKIPPED] Failed to fetch $url - {$response['error']}");
+                }
+            }
+            catch (\Exception)
+            {
+                $failed = true;
+                $this->warn("[SKIPPED] Failed to fetch $url");
+            }
+
+            if ($failed)
+            {
+                $failedIds[] = $id;
+                Storage::put($failedPath, json_encode($failedIds));
+            }
+            else
+            {
+                $success[] = $id;
+            }
+
+            // store the next position so a resumed run continues after this entry
+            Storage::put($resumePath, (string)($i + 1));
+
+            if ($delay > 0 && $i < $idCount - 1)
+            {
+                sleep($delay);
+            }
+        }
+
+        Storage::delete($resumePath);
+
+        $this->info("--- Indexing of $mediaType is complete.");
+        $this->info(count($success) . ' entries indexed or updated.');
+        if (count($failedIds) > 0)
+        {
+            $this->info(count($failedIds) . ' entries failed to index or update. Re-run with --failed to requeue failed entries only.');
+        }
+
+        // finalize the latest state; a --failed run writes no pending snapshot
+        if (Storage::exists($pendingSnapshotPath))
+        {
+            Storage::move($pendingSnapshotPath, "indexer/incremental/$mediaType.json");
+        }
+    }
+
+    /**
+     * Validate the command input and run the incremental indexer for every requested media type.
+     *
+     * @return int 0 on success, 1 when the input is invalid, 127 when the run was cancelled by a signal.
+     */
+    public function handle(): int
+    {
+        // validate inputs
+        $validator = Validator::make(
+            [
+                'mediaType' => $this->argument('mediaType'),
+                'delay' => $this->option('delay'),
+                'resume' => $this->option('resume') ?? false,
+                'failed' => $this->option('failed') ?? false
+            ],
+            [
+                // mediaType is an array argument, so every element is validated on its own
+                'mediaType' => 'required|array',
+                'mediaType.*' => 'in:anime,manga',
+                'delay' => 'integer|min:1',
+                'resume' => 'bool',
+                'failed' => 'bool'
+            ]
+        );
+
+        if ($validator->fails()) {
+            $this->error($validator->errors()->toJson());
+            return 1;
+        }
+
+        $resume = (bool)($this->option('resume') ?? false);
+        $onlyFailed = (bool)($this->option('failed') ?? false);
+        $delay = (int)$this->option('delay');
+
+        // the two modes are mutually exclusive; checked explicitly on the parsed flags
+        if ($resume && $onlyFailed)
+        {
+            $this->error('The --resume and --failed options cannot be used together.');
+            return 1;
+        }
+
+        // we want to handle signals from the OS
+        $this->trap([SIGTERM, SIGQUIT, SIGINT], fn () => $this->cancelled = true);
+
+        /**
+         * @var $mediaTypes array
+         */
+        $mediaTypes = $this->argument("mediaType");
+
+        foreach ($mediaTypes as $mediaType)
+        {
+            $idsToFetch = [];
+
+            // if "--failed" option is specified just run the failed ones
+            if ($onlyFailed && Storage::exists("indexer/incremental/{$mediaType}_failed.json"))
+            {
+                $idsToFetch["sfw"] = $this->getFailedIdsToFetch($mediaType);
+            }
+            else
+            {
+                $idsToFetch = $this->getIdsToFetch($mediaType);
+            }
+
+            if ($this->cancelled)
+            {
+                return 127;
+            }
+
+            $idCount = count($idsToFetch);
+            if ($idCount === 0)
+            {
+                continue;
+            }
+
+            $this->fetchIds($mediaType, $idsToFetch, $resume, $delay);
+
+            // fetchIds() returns early on cancellation; report it for the last media type as well
+            if ($this->cancelled)
+            {
+                return 127;
+            }
+        }
+
+        return 0;
+    }
+}
diff --git a/app/Console/Kernel.php b/app/Console/Kernel.php
index 22c9b19c..eafe5553 100644
--- a/app/Console/Kernel.php
+++ b/app/Console/Kernel.php
@@ -24,7 +24,8 @@ class Kernel extends ConsoleKernel
         Indexer\GenreIndexer::class,
         Indexer\ProducersIndexer::class,
         Indexer\AnimeSweepIndexer::class,
-        Indexer\MangaSweepIndexer::class
+        Indexer\MangaSweepIndexer::class,
+        Indexer\IncrementalIndexer::class
     ];
 
     /**
diff --git a/app/Features/QuerySpecificAnimeSeasonHandler.php b/app/Features/QuerySpecificAnimeSeasonHandler.php
index 1ec68210..e073fa0f 100644
--- a/app/Features/QuerySpecificAnimeSeasonHandler.php
+++ b/app/Features/QuerySpecificAnimeSeasonHandler.php
@@ -3,8 +3,6 @@
 namespace App\Features;
 
 use App\Dto\QuerySpecificAnimeSeasonCommand;
-use App\Enums\AnimeSeasonEnum;
-use App\Enums\AnimeStatusEnum;
 use App\Enums\AnimeTypeEnum;
 use Illuminate\Contracts\Database\Query\Builder;
 use Illuminate\Support\Carbon;
diff --git a/composer.json b/composer.json
index 8234697c..55e6a704 100644
--- a/composer.json
+++ b/composer.json
@@ -14,6 +14,7 @@
         "php": "^8.1",
         "ext-json": "*",
         "ext-mongodb": "*",
+        "ext-pcntl": "*",
         "amphp/http-client": "^4.6",
         "danielmewes/php-rql": "dev-master",
         "darkaonline/swagger-lume": "^9.0",
diff --git a/container-setup.sh b/container-setup.sh
index 89c39c61..3cc1f5c0 100755
--- a/container-setup.sh
+++ 
b/container-setup.sh
@@ -34,6 +34,7 @@ display_help() {
    echo "stop Stop Jikan API"
    echo "validate-prereqs Validate pre-reqs installed (docker, docker-compose)"
    echo "execute-indexers Execute the indexers, which will scrape and index data from MAL. (Notice: This can take days)"
+   echo "index-incrementally Executes the incremental indexers for each media type. (anime, manga)"
    echo ""
 }
 
@@ -168,6 +169,12 @@ case "$1" in
       $DOCKER_COMPOSE_CMD -p "$DOCKER_COMPOSE_PROJECT_NAME" exec jikan_rest php /app/artisan indexer:producers
       echo "Indexing done!"
       ;;
+   # Runs the incremental indexer for anime and manga inside the jikan_rest container.
+   "index-incrementally")
+      echo "Indexing..."
+      $DOCKER_COMPOSE_CMD -p "$DOCKER_COMPOSE_PROJECT_NAME" exec jikan_rest php /app/artisan indexer:incremental anime manga
+      echo "Indexing done!"
+      ;;
    *)
       echo "No command specified, displaying help"
       display_help
diff --git a/container_usage.md b/container_usage.md
index bce6bd74..f20e1103 100644
--- a/container_usage.md
+++ b/container_usage.md
@@ -16,6 +16,9 @@ This will:
 > **Note**: The script supports both `docker` and `podman`. In case of `podman` please bare in mind that sometimes the container name resolution doesn't work on the container network.
 > In those cases you might have to install `aardvark-dns` package. On `Arch Linux` podman uses `netavark` network by default (in 2023) so you will need to install the before mentioned package.
 
+> **Note 2**: The script will start the jikan API, but if you start it for the first time, it won't have any data in it!
+> You will have to run the indexers through artisan to have data. See ["Running the indexer with the script"](#running-the-indexer-with-the-script) section.
+
 The script has the following prerequisites and will notify you if these are not present:
 
 - git
@@ -36,6 +39,7 @@ start Start Jikan API (mongodb, typesense, redis, jikan-api wor
 stop Stop Jikan API
 validate-prereqs Validate pre-reqs installed (docker, docker-compose)
 execute-indexers Execute the indexers, which will scrape and index data from MAL. (Notice: This can take days)
+index-incrementally Executes the incremental indexers for each media type. (anime, manga)
 ```
 
 ### Running the indexer with the script