From a106f248d8435275675eaf905e55b358fbff750b Mon Sep 17 00:00:00 2001 From: Matthew Hilton Date: Fri, 23 Aug 2024 16:23:43 +1000 Subject: [PATCH] tagging: move mimetype to metadata, add location/orphan tag source --- TAGGING.md | 61 +++++---------- classes/local/manager.php | 24 ++++-- .../local/object_manipulator/manipulator.php | 2 +- classes/local/store/object_file_system.php | 76 +++++++++++++++++-- classes/local/store/s3/client.php | 11 ++- classes/local/store/s3/file_system.php | 3 +- classes/local/tag/environment_source.php | 4 +- ...me_type_source.php => location_source.php} | 22 ++---- classes/local/tag/tag_manager.php | 2 +- lang/en/tool_objectfs.php | 4 +- tests/local/tagging_test.php | 23 +++++- tests/object_file_system_test.php | 59 +++++++++++--- 12 files changed, 198 insertions(+), 93 deletions(-) rename classes/local/tag/{mime_type_source.php => location_source.php} (68%) diff --git a/TAGGING.md b/TAGGING.md index 0c2155e8..c7795714 100644 --- a/TAGGING.md +++ b/TAGGING.md @@ -6,7 +6,7 @@ Currently, this is only implemented for the S3 file system client. Note object tags are different from object metadata. -Object metadata is immutable, and attached to the object on upload. With metadata, if you wish to update it (for example during a migration, or the sources changed), you have to copy the object with the new metadata, and delete the old object. This is problematic, since deletion is optional in objectfs. +Object metadata is immutable, and attached to the object on upload. With metadata, if you wish to update it (for example during a migration, or the sources changed), you have to copy the object with the new metadata, and delete the old object. This is not ideal, since deletion is optional in objectfs. Object tags are more suitable, since their permissions can be managed separately (e.g. a client can be allowed to modify tags, but not delete objects). @@ -21,54 +21,33 @@ The following sources are implemented currently: ### Environment What environment the file was uploaded in. Configure the environment using `$CFG->objectfs_environment_name` -### Mimetype -What mimetype the file is stored as under the `mdl_files` table. - -## Multiple environments pointing to single bucket -It is possible you are using objectfs with multiple environments (e.g. prod, staging) that both point to the same bucket. Since files are referenced by contenthash, it generally does not matter where they come from, so this isn't a problem. However to ensure the tags remain accurate, you should turn off `overwriteobjecttags` in the plugin settings for every environment except production. - -This means that staging is unable to overwrite tags for files uploaded elsewhere, but can set it on files only uploaded only from staging. However, files uploaded from production will always have the correct tags, and will overwrite any existing tags. - -```mermaid -graph LR - subgraph S3 - Object("`**Object** - contenthash: xyz - tags: env=prod`") - end - subgraph Prod - UploadObjectProd["`**Upload object** - contenthash: xyz - tags: env=prod`"] --> Object - end - subgraph Staging - UploadObjectStaging["`**Upload object** - contenthash: xyz - tags: env=staging`"] - end - Blocked["Blocked - does not have permissions\nto overwrite existing object tags"] - UploadObjectStaging --- Blocked - Blocked -.-> Object - - style Object fill:#ffffff00,stroke:#ffa812 - style S3 fill:#ffffff00,stroke:#ffa812 - style Prod fill:#ffffff00,stroke:#26ff4a - style UploadObjectProd fill:#ffffff00,stroke:#26ff4a - style Staging fill:#ffffff00,stroke:#978aff - style UploadObjectStaging fill:#ffffff00,stroke:#978aff - style Blocked fill:#ffffff00,stroke:#ff0000 -``` +This tag is also used by objectfs to determine if tags can be overwritten. See [Multiple environments setup](#multiple-environments-setup) for more information. + +### Location +Either `orphan` if the file no longer exists in the `files` table in Moodle, otherwise `active`. + +## Multiple environments setup +This feature is designed to work in situations where multiple environments (e.g. prod, staging) points to the same bucket, however, some setup is needed: + +1. Turn off `overwriteobjecttags` in every environment except the production environment. +2. Configure `$CFG->objectfs_environment_name` to be unique for all environments. + +By doing the above two steps, it will allow the production environment to always set its own tags, even if a file was first uploaded to staging and then to production. + +Lower environments can still update tags, but only if the `environment` matches theirs. This allows staging to manage object tags on objects only it knows about, but as soon as the file is uploaded from production (and therefore have it's environment tag replaced with `prod`), staging will no longer touch it. ## Migration -If the way a tag was calculated has changed, or new tags are added (or removed) or this feature was turned on for the first time (or turned on after being off), you must do the following: -- Manually run `trigger_update_object_tags` scheduled task from the UI, which queues a `update_object_tags` adhoc task that will process all objects marked as needing sync (default is true) +Only new objects uploaded after enabling this feature will have tags added. To backfill tags for previously uploaded objects, you must do the following: + +- Manually run `trigger_update_object_tags` scheduled task from the UI, which queues a `update_object_tags` adhoc task that will process all objects marked as needing sync. or - Call the CLI to execute a `update_object_tags` adhoc task manually. +You may need to update the DB to mark objects tag sync status as needing sync if the object has previously been synced before. ## Reporting There is an additional graph added to the object summary report showing the tag value combinations and counts of each. -Note, this is only for files that have been uploaded from this environment, and may not be consistent for environments where `overwriteobjecttags` is disabled (because the site does not know if a file was overwritten in the external store by another client). +Note, this is only for files that have been uploaded from the respective environment, and may not be consistent for environments where `overwriteobjecttags` is disabled (because the site does not know if a file was overwritten in the external store by another client). ## For developers diff --git a/classes/local/manager.php b/classes/local/manager.php index 188df9fc..19694318 100644 --- a/classes/local/manager.php +++ b/classes/local/manager.php @@ -27,6 +27,7 @@ use stdClass; use tool_objectfs\local\store\object_file_system; +use tool_objectfs\local\tag\tag_manager; /** * [Description manager] @@ -160,7 +161,7 @@ public static function update_object_by_hash($contenthash, $newlocation, $filesi $newobject->filesize = isset($oldobject->filesize) ? $oldobject->filesize : $DB->get_field('files', 'filesize', ['contenthash' => $contenthash], IGNORE_MULTIPLE); - return self::update_object($newobject, $newlocation); + return self::upsert_object($newobject, $newlocation); } $newobject->location = $newlocation; @@ -173,9 +174,7 @@ public static function update_object_by_hash($contenthash, $newlocation, $filesi $newobject->filesize = $filesize; $newobject->timeduplicated = time(); } - $DB->insert_record('tool_objectfs_objects', $newobject); - - return $newobject; + return self::upsert_object($newobject, $newlocation); } /** @@ -185,7 +184,7 @@ public static function update_object_by_hash($contenthash, $newlocation, $filesi * @return stdClass * @throws \dml_exception */ - public static function update_object(stdClass $object, $newlocation) { + public static function upsert_object(stdClass $object, $newlocation) { global $DB; // If location change is 'duplicated' we update timeduplicated. @@ -193,8 +192,21 @@ public static function update_object(stdClass $object, $newlocation) { $object->timeduplicated = time(); } + $locationchanged = !isset($object->location) || $object->location != $newlocation; $object->location = $newlocation; - $DB->update_record('tool_objectfs_objects', $object); + + // If id is set, update, else insert new. + if (empty($object->id)) { + $object->id = $DB->insert_record('tool_objectfs_objects', $object); + } else { + $DB->update_record('tool_objectfs_objects', $object); + } + + // Post update, notify tag manager since the location tag likely needs changing. + if ($locationchanged && tag_manager::is_tagging_enabled_and_supported()) { + $fs = get_file_storage()->get_file_system(); + $fs->push_object_tags($object->contenthash); + } return $object; } diff --git a/classes/local/object_manipulator/manipulator.php b/classes/local/object_manipulator/manipulator.php index f5108305..afd21808 100644 --- a/classes/local/object_manipulator/manipulator.php +++ b/classes/local/object_manipulator/manipulator.php @@ -111,7 +111,7 @@ public function execute(array $objectrecords) { $newlocation = $this->manipulate_object($objectrecord); if (!empty($objectrecord->id)) { - manager::update_object($objectrecord, $newlocation); + manager::upsert_object($objectrecord, $newlocation); } else { manager::update_object_by_hash($objectrecord->contenthash, $newlocation); } diff --git a/classes/local/store/object_file_system.php b/classes/local/store/object_file_system.php index 3cfe18b0..6ecdfa83 100644 --- a/classes/local/store/object_file_system.php +++ b/classes/local/store/object_file_system.php @@ -39,6 +39,7 @@ use coding_exception; use Throwable; use tool_objectfs\local\manager; +use tool_objectfs\local\tag\environment_source; use tool_objectfs\local\tag\tag_manager; defined('MOODLE_INTERNAL') || die(); @@ -167,6 +168,23 @@ protected function get_local_path_from_hash($contenthash, $fetchifnotfound = fal return $path; } + /** + * Returns mimetype for a given hash + * @param string $contenthash + * @return string mimetype as stored in mdl_files + */ + protected function get_mimetype_from_hash(string $contenthash): string { + global $DB; + + // We limit 1 because multiple files can have the same contenthash. + // However, they all have the same mimetype so it does not matter which one we query. + return $DB->get_field_sql('SELECT mimetype + FROM {files} + WHERE contenthash = :hash + LIMIT 1', + ['hash' => $contenthash]); + } + /** * get_remote_path_from_storedfile * @param \stored_file $file @@ -1185,13 +1203,7 @@ public function push_object_tags(string $contenthash) { } try { - $objectexists = $this->is_file_readable_externally_by_hash($contenthash); - - // Object must exist, and we can overwrite (and not care about existing tags) - // or cannot overwrite, and the tags are empty. - // Avoid unnecessarily checking tags, since this is an extra API call. - $canset = $objectexists && (tag_manager::can_overwrite_object_tags() || - empty($this->get_external_client()->get_object_tags($contenthash))); + $canset = $this->can_set_object_tags($contenthash); $timepushed = 0; if ($canset) { @@ -1200,7 +1212,7 @@ public function push_object_tags(string $contenthash) { tag_manager::store_tags_locally($contenthash, $tags); // Record the time it was actually pushed to the external store - // (i.e. not when it existed already and we pulled the tags down). + // (i.e. not when it existed already and was skipped). $timepushed = time(); } @@ -1216,4 +1228,52 @@ public function push_object_tags(string $contenthash) { } $lock->release(); } + + /** + * Returns true if the current env can set the given object's tags. + * + * To set the tags: + * - The object must exist + * - We can overwrite tags (and not care about any existing) + * OR + * - We cannot overwrite tags, and the tags are empty or the environment is the same as ours. + * + * Avoids unnecessarily querying tags as this is an extra api call to the object store. + * + * @param string $contenthash + * @return bool + */ + private function can_set_object_tags(string $contenthash): bool { + $objectexists = $this->is_file_readable_externally_by_hash($contenthash); + + // Object must exist, we cannot set tags on an object that is missing. + if (!$objectexists) { + return false; + } + + // If can overwrite tags, we don't care then about any existing tags. + if (tag_manager::can_overwrite_object_tags()) { + return true; + } + + // Else we need to check the tags are empty, or the env matches ours. + $existingtags = $this->get_external_client()->get_object_tags($contenthash); + + // Not set yet, must be a new object. + if (empty($existingtags) || !isset($existingtags[environment_source::get_identifier()])) { + return true; + } + + // TODO unit test this case. + $envsource = new environment_source(); + $currentenv = $envsource->get_value_for_contenthash($contenthash); + + // Env is the same as ours, allowed to set. + if ($existingtags[environment_source::get_identifier()] == $currentenv) { + return true; + } + + // Else no match, do not set. + return false; + } } diff --git a/classes/local/store/s3/client.php b/classes/local/store/s3/client.php index 767e177e..e3ea37ef 100644 --- a/classes/local/store/s3/client.php +++ b/classes/local/store/s3/client.php @@ -496,10 +496,11 @@ public function define_client_section($settings, $config) { * * @param string $localpath Path to a local file. * @param string $contenthash Content hash of the file. + * @param string $mimetype the mimetype of the file being uploaded * * @throws \Exception if fails. */ - public function upload_to_s3($localpath, $contenthash) { + public function upload_to_s3($localpath, $contenthash, string $mimetype) { $filehandle = fopen($localpath, 'rb'); if (!$filehandle) { @@ -511,7 +512,13 @@ public function upload_to_s3($localpath, $contenthash) { $uploader = new \Aws\S3\ObjectUploader( $this->client, $this->bucket, $this->bucketkeyprefix . $externalpath, - $filehandle + $filehandle, + 'private', + [ + 'params' => [ + 'ContentType' => $mimetype, + ], + ] ); $uploader->upload(); fclose($filehandle); diff --git a/classes/local/store/s3/file_system.php b/classes/local/store/s3/file_system.php index dd911092..ffff0ce4 100644 --- a/classes/local/store/s3/file_system.php +++ b/classes/local/store/s3/file_system.php @@ -85,9 +85,10 @@ public function readfile(\stored_file $file) { */ public function copy_from_local_to_external($contenthash) { $localpath = $this->get_local_path_from_hash($contenthash); + $mime = $this->get_mimetype_from_hash($contenthash); try { - $this->get_external_client()->upload_to_s3($localpath, $contenthash); + $this->get_external_client()->upload_to_s3($localpath, $contenthash, $mime); return true; } catch (\Exception $e) { $this->get_logger()->error_log( diff --git a/classes/local/tag/environment_source.php b/classes/local/tag/environment_source.php index d7715ddb..127b08fc 100644 --- a/classes/local/tag/environment_source.php +++ b/classes/local/tag/environment_source.php @@ -17,7 +17,7 @@ namespace tool_objectfs\local\tag; /** - * Provides environment a file was uploaded in. + * Provides current environment to file. * * @package tool_objectfs * @author Matthew Hilton @@ -53,7 +53,7 @@ private static function get_env(): ?string { /** * Returns the tag value for the given file contenthash * @param string $contenthash - * @return string|null mime type for file. + * @return string|null environment value. */ public function get_value_for_contenthash(string $contenthash): ?string { return self::get_env(); diff --git a/classes/local/tag/mime_type_source.php b/classes/local/tag/location_source.php similarity index 68% rename from classes/local/tag/mime_type_source.php rename to classes/local/tag/location_source.php index 7546cdcf..1353a03a 100644 --- a/classes/local/tag/mime_type_source.php +++ b/classes/local/tag/location_source.php @@ -17,20 +17,20 @@ namespace tool_objectfs\local\tag; /** - * Provides mime type of file. + * Provides location status for a file. * * @package tool_objectfs * @author Matthew Hilton * @copyright Catalyst IT * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later */ -class mime_type_source implements tag_source { +class location_source implements tag_source { /** * Identifier used in tagging file. Is the 'key' of the tag. * @return string */ public static function get_identifier(): string { - return 'mimetype'; + return 'location'; } /** @@ -38,7 +38,7 @@ public static function get_identifier(): string { * @return string */ public static function get_description(): string { - return get_string('tagsource:mimetype', 'tool_objectfs'); + return get_string('tagsource:location', 'tool_objectfs'); } /** @@ -48,18 +48,10 @@ public static function get_description(): string { */ public function get_value_for_contenthash(string $contenthash): ?string { global $DB; - // Sometimes multiple with same hash are uploaded (e.g. real vs draft), - // in this case, just take the first (mimetype is the same regardless). - $mime = $DB->get_field_sql('SELECT mimetype - FROM {files} - WHERE contenthash = :hash - LIMIT 1', - ['hash' => $contenthash]); - if (empty($mime)) { - return null; - } + $isorphaned = $DB->record_exists('tool_objectfs_objects', ['contenthash' => $contenthash, + 'location' => OBJECT_LOCATION_ORPHANED]); - return $mime; + return $isorphaned ? 'orphan' : 'active'; } } diff --git a/classes/local/tag/tag_manager.php b/classes/local/tag/tag_manager.php index 43fdf7a4..058b82f6 100644 --- a/classes/local/tag/tag_manager.php +++ b/classes/local/tag/tag_manager.php @@ -66,8 +66,8 @@ public static function get_defined_tag_sources(): array { // All possible tag sources should be defined here. // Note this should be a maximum of 10 sources, as this is an AWS limit. return [ - new mime_type_source(), new environment_source(), + new location_source(), ]; } diff --git a/lang/en/tool_objectfs.php b/lang/en/tool_objectfs.php index b8ac6386..6f43efff 100644 --- a/lang/en/tool_objectfs.php +++ b/lang/en/tool_objectfs.php @@ -278,7 +278,7 @@ $string['settings:maxtaggingiterations'] = 'Object tagging adhoc sync maximum number of iterations '; $string['settings:maxtaggingiterations:desc'] = 'The maximum number of times the tagging sync adhoc task will requeue itself. To avoid accidental infinite runaway.'; $string['settings:overrideobjecttags'] = 'Allow object tag override'; -$string['settings:overrideobjecttags:desc'] = 'Allows ObjectFS to overwrite tags on objects that already exist in the external store.'; +$string['settings:overrideobjecttags:desc'] = 'Allows ObjectFS to overwrite tags on objects that already exist in the external store. If not checked, objectfs will only set tags when the objects "environment" value is empty or is the same as currently defined.'; $string['settings:tagsources'] = 'Tag sources'; $string['settings:taggingstatus'] = 'Tagging status'; $string['settings:taggingstatuscounts'] = 'Tag sync status overview'; @@ -298,7 +298,7 @@ $string['check:tagging:error'] = 'Error trying to tag object'; $string['tagsource:environment'] = 'Environment defined by $CFG->objectfs_environment_name, currently: "{$a}".'; -$string['tagsource:mimetype'] = 'File mimetype as stored in {files} table'; +$string['tagsource:location'] = 'Location of file, either "orphan" or "active".'; $string['task:triggerupdateobjecttags'] = 'Queue adhoc task to update object tags'; diff --git a/tests/local/tagging_test.php b/tests/local/tagging_test.php index 144e2191..1b6a2d9a 100644 --- a/tests/local/tagging_test.php +++ b/tests/local/tagging_test.php @@ -153,10 +153,29 @@ public function test_gather_object_tags_for_upload() { $object = $this->create_duplicated_object('gather tags for upload test'); $tags = tag_manager::gather_object_tags_for_upload($object->contenthash); - $this->assertArrayHasKey('mimetype', $tags); $this->assertArrayHasKey('environment', $tags); - $this->assertEquals('text', $tags['mimetype']); $this->assertEquals('test', $tags['environment']); + $this->assertArrayHasKey('location', $tags); + $this->assertEquals('active', $tags['location']); + } + + /** + * Tests gather_object_tags_for_upload when orphaned + * @covers \tool_objectfs\local\tag_manager::gather_object_tags_for_upload + */ + public function test_gather_object_tags_for_upload_orphaned() { + global $DB; + $object = $this->create_duplicated_object('gather tags for upload test'); + + // Change the object record to be orphaned. + $DB->update_record('tool_objectfs_objects', ['id' => $object->id, 'location' => OBJECT_LOCATION_ORPHANED]); + + $tags = tag_manager::gather_object_tags_for_upload($object->contenthash); + + $this->assertArrayHasKey('environment', $tags); + $this->assertEquals('test', $tags['environment']); + $this->assertArrayHasKey('location', $tags); + $this->assertEquals('orphan', $tags['location']); } /** diff --git a/tests/object_file_system_test.php b/tests/object_file_system_test.php index 1fb6a785..6a52b354 100644 --- a/tests/object_file_system_test.php +++ b/tests/object_file_system_test.php @@ -1055,35 +1055,68 @@ public function test_push_object_tags_object_not_replicated() { */ public static function push_object_tags_replicated_provider(): array { return [ - 'can override' => [ + // Can override, doesn't matter if envs are different. + 'can override - different env' => [ + 'object env' => 'prod', + 'push env' => 'staging', 'can override' => true, + 'expected override' => true, ], - 'cannot override' => [ - 'cannot override' => false, + 'can override - same env' => [ + 'object env' => 'prod', + 'push env' => 'prod', + 'can override' => true, + 'expected override' => true, + ], + 'can override - empty env' => [ + 'object env' => '', + 'push env' => 'prod', + 'can override' => true, + 'expected override' => true, + ], + // Cannot override, env must match or be empty. + 'cannot override - same env' => [ + 'object env' => 'prod', + 'push env' => 'prod', + 'can override' => false, + 'expected override' => true, + ], + 'cannot override - different env' => [ + 'object env' => 'prod', + 'push env' => 'staging', + 'can override' => false, + 'expected override' => false, + ], + 'cannot override - env is empty' => [ + 'object env' => '', + 'push env' => 'staging', + 'can override' => false, + 'expected override' => true, ], ]; } /** * Tests push_object_tags when the object is replicated. + * Tests rules around overriding are correctly applied. + * + * @param string $objectenv the env to set when 'uploading' the object + * @param string $pushenv the env to set when trying to push new tags * @param bool $canoverride if filesystem should be able to overwrite existing objects + * @param bool $expectedoverride if it was expected that the tags were overwritten. * @dataProvider push_object_tags_replicated_provider */ - public function test_push_object_tags_replicated(bool $canoverride) { + public function test_push_object_tags_replicated(string $objectenv, string $pushenv, bool $canoverride, + bool $expectedoverride) { global $CFG, $DB; $CFG->phpunit_objectfs_supports_object_tagging = true; + $CFG->objectfs_environment_name = $objectenv; set_config('overwriteobjecttags', $canoverride, 'tool_objectfs'); $this->assertEquals($canoverride, tag_manager::can_overwrite_object_tags()); $object = $this->create_duplicated_object('test syncing replicated'); - - $testtags = [ - 'test' => 123, - 'test2' => 123, - 'test3' => 123, - 'test4' => 123, - ]; + $testtags = tag_manager::gather_object_tags_for_upload($object->contenthash); // Fake set the tags in the external store. $this->filesystem->get_external_client()->tags[$object->contenthash] = $testtags; @@ -1096,6 +1129,8 @@ public function test_push_object_tags_replicated(bool $canoverride) { $localtags = $DB->get_records('tool_objectfs_object_tags', ['objectid' => $object->id]); $this->assertCount(0, $localtags); + $CFG->objectfs_environment_name = $pushenv; + // Sync the file. $this->filesystem->push_object_tags($object->contenthash); @@ -1104,7 +1139,7 @@ public function test_push_object_tags_replicated(bool $canoverride) { $externaltags = $this->filesystem->get_external_client()->get_object_tags($object->contenthash); $time = $DB->get_field('tool_objectfs_objects', 'tagslastpushed', ['id' => $object->id]); - if ($canoverride) { + if ($expectedoverride) { // If can override, we expect it to be overwritten by the tags defined in the sources. $expectednum = count(tag_manager::get_defined_tag_sources()); $this->assertCount($expectednum, $localtags);