From b4ee57c6961d6f2fa08b69235ac9b8ad90bd5fcd Mon Sep 17 00:00:00 2001 From: John Davis Date: Wed, 18 Sep 2024 15:52:34 -0400 Subject: [PATCH] Username/email migration commits (squashed) Add directory for migrations data fixing scripts Add username deduplication data fixer Add migration for username column unique constraint Add email deduplication data fixer Add migration for email column unique constraint Update the model w/unique constraints Fix integration test that violated db integrity constraint Randomize test user email to respect unique constraint Fix bug in test_rule_helper Fix test_quota to respect constraint Fix test_galaxy_mapping to respect constraint --- lib/galaxy/model/__init__.py | 2 +- ...95475b58_email_column_unique_constraint.py | 52 ++++++++ ...a6168_username_column_unique_constraint.py | 51 ++++++++ .../model/migrations/data_fixes/__init__.py | 4 + .../migrations/data_fixes/user_table_fixer.py | 112 ++++++++++++++++++ lib/galaxy/model/unittest_utils/utils.py | 13 ++ .../test_celery_user_rate_limit.py | 2 +- test/unit/app/jobs/test_rule_helper.py | 2 +- test/unit/data/model/conftest.py | 17 +-- test/unit/data/test_galaxy_mapping.py | 9 +- test/unit/data/test_quota.py | 3 +- test/unit/workflows/test_run_parameters.py | 3 +- 12 files changed, 248 insertions(+), 22 deletions(-) create mode 100644 lib/galaxy/model/migrations/alembic/versions_gxy/1cf595475b58_email_column_unique_constraint.py create mode 100644 lib/galaxy/model/migrations/alembic/versions_gxy/d619fdfa6168_username_column_unique_constraint.py create mode 100644 lib/galaxy/model/migrations/data_fixes/__init__.py create mode 100644 lib/galaxy/model/migrations/data_fixes/user_table_fixer.py create mode 100644 lib/galaxy/model/unittest_utils/utils.py diff --git a/lib/galaxy/model/__init__.py b/lib/galaxy/model/__init__.py index 57ca51e3948e..297af62e0471 100644 --- a/lib/galaxy/model/__init__.py +++ b/lib/galaxy/model/__init__.py @@ -776,7 +776,7 @@ class User(Base, Dictifiable, RepresentById): id: Mapped[int] = mapped_column(primary_key=True) create_time: Mapped[datetime] = mapped_column(default=now, nullable=True) update_time: Mapped[datetime] = mapped_column(default=now, onupdate=now, nullable=True) - email: Mapped[str] = mapped_column(TrimmedString(255), index=True) + email: Mapped[str] = mapped_column(TrimmedString(255), index=True, unique=True) username: Mapped[Optional[str]] = mapped_column(TrimmedString(255), index=True, unique=True) password: Mapped[str] = mapped_column(TrimmedString(255)) last_password_change: Mapped[Optional[datetime]] = mapped_column(default=now) diff --git a/lib/galaxy/model/migrations/alembic/versions_gxy/1cf595475b58_email_column_unique_constraint.py b/lib/galaxy/model/migrations/alembic/versions_gxy/1cf595475b58_email_column_unique_constraint.py new file mode 100644 index 000000000000..ba356b1c770d --- /dev/null +++ b/lib/galaxy/model/migrations/alembic/versions_gxy/1cf595475b58_email_column_unique_constraint.py @@ -0,0 +1,52 @@ +"""Email column unique constraint + +Revision ID: 1cf595475b58 +Revises: d619fdfa6168 +Create Date: 2024-07-03 19:53:22.443016 +""" + +from alembic import op + +from galaxy.model.database_object_names import build_index_name +from galaxy.model.migrations.data_fixes.user_table_fixer import EmailDeduplicator +from galaxy.model.migrations.util import ( + create_index, + drop_index, + index_exists, + transaction, +) + +# revision identifiers, used by Alembic. +revision = "1cf595475b58" +down_revision = "d619fdfa6168" +branch_labels = None +depends_on = None + + +table_name = "galaxy_user" +column_name = "email" +index_name = build_index_name(table_name, [column_name]) + + +def upgrade(): + with transaction(): + _fix_duplicate_emails() + # Existing databases may have an existing index we no longer need + # New databases will not have that index, so we must check. + if index_exists(index_name, table_name, False): + drop_index(index_name, table_name) + # Create a UNIQUE index + create_index(index_name, table_name, [column_name], unique=True) + + +def downgrade(): + with transaction(): + drop_index(index_name, table_name) + # Restore a non-unique index + create_index(index_name, table_name, [column_name]) + + +def _fix_duplicate_emails(): + """Fix records with duplicate usernames""" + connection = op.get_bind() + EmailDeduplicator(connection).run() diff --git a/lib/galaxy/model/migrations/alembic/versions_gxy/d619fdfa6168_username_column_unique_constraint.py b/lib/galaxy/model/migrations/alembic/versions_gxy/d619fdfa6168_username_column_unique_constraint.py new file mode 100644 index 000000000000..de09d29097bb --- /dev/null +++ b/lib/galaxy/model/migrations/alembic/versions_gxy/d619fdfa6168_username_column_unique_constraint.py @@ -0,0 +1,51 @@ +"""Username column unique constraint + +Revision ID: d619fdfa6168 +Revises: d2d8f51ebb7e +Create Date: 2024-07-02 13:13:10.325586 +""" + +from alembic import op + +from galaxy.model.database_object_names import build_index_name +from galaxy.model.migrations.data_fixes.user_table_fixer import UsernameDeduplicator +from galaxy.model.migrations.util import ( + create_index, + drop_index, + index_exists, + transaction, +) + +# revision identifiers, used by Alembic. +revision = "d619fdfa6168" +down_revision = "d2d8f51ebb7e" +branch_labels = None +depends_on = None + +table_name = "galaxy_user" +column_name = "username" +index_name = build_index_name(table_name, [column_name]) + + +def upgrade(): + with transaction(): + _fix_duplicate_usernames() + # Existing databases may have an existing index we no longer need + # New databases will not have that index, so we must check. + if index_exists(index_name, table_name, False): + drop_index(index_name, table_name) + # Create a UNIQUE index + create_index(index_name, table_name, [column_name], unique=True) + + +def downgrade(): + with transaction(): + drop_index(index_name, table_name) + # Restore a non-unique index + create_index(index_name, table_name, [column_name]) + + +def _fix_duplicate_usernames(): + """Fix records with duplicate usernames""" + connection = op.get_bind() + UsernameDeduplicator(connection).run() diff --git a/lib/galaxy/model/migrations/data_fixes/__init__.py b/lib/galaxy/model/migrations/data_fixes/__init__.py new file mode 100644 index 000000000000..8b48aef46800 --- /dev/null +++ b/lib/galaxy/model/migrations/data_fixes/__init__.py @@ -0,0 +1,4 @@ +""" +Package contains code for fixing inconsistent data in the database that must be +run together with a migration script. +""" diff --git a/lib/galaxy/model/migrations/data_fixes/user_table_fixer.py b/lib/galaxy/model/migrations/data_fixes/user_table_fixer.py new file mode 100644 index 000000000000..4b9054872cd0 --- /dev/null +++ b/lib/galaxy/model/migrations/data_fixes/user_table_fixer.py @@ -0,0 +1,112 @@ +import uuid + +from sqlalchemy import ( + func, + Result, + select, + text, + update, +) + +from galaxy.model import User + + +class UsernameDeduplicator: + + def __init__(self, connection): + self.connection = connection + + def run(self): + """ + Deduplicate usernames by generating a unique value for all duplicates, keeping + the username of the most recently created user unchanged. + Records updated with the generated value are marked as deleted. + """ + duplicates = self._get_duplicate_username_data() + prev_username = None + for id, username, _ in duplicates: + if username == prev_username: + new_username = self._generate_next_available_username(username) + stmt = update(User).where(User.id == id).values(username=new_username, deleted=True) + self.connection.execute(stmt) + else: + prev_username = username + + def _get_duplicate_username_data(self) -> Result: + # Duplicate usernames + duplicates_stmt = select(User.username).group_by(User.username).having(func.count() > 1) + # User data for records with duplicate usernames (ordering: newest to oldest) + stmt = ( + select(User.id, User.username, User.create_time) + .where(User.username.in_(duplicates_stmt)) + .order_by(User.username, User.create_time.desc()) + ) + return self.connection.execute(stmt) + + def _generate_next_available_username(self, username): + i = 1 + while self.connection.execute(select(User).where(User.username == f"{username}-{i}")).first(): + i += 1 + return f"{username}-{i}" + + +class EmailDeduplicator: + + def __init__(self, connection): + self.connection = connection + + def run(self): + """ + Deduplicate user emails by generating a unique value for all duplicates, keeping + the email of the most recently created user that has one or more history unchanged. + If such a user does not exist, keep the oldest user. + Records updated with the generated value are marked as deleted (we presume them + to be invalid, and the user should not be able to login). + """ + stmt = select(User.email).group_by(User.email).having(func.count() > 1) + duplicate_emails = self.connection.scalars(stmt) + for email in duplicate_emails: + users = self._get_users_with_same_email(email) + user_with_history = self._find_oldest_user_with_history(users) + duplicates = self._get_users_to_deduplicate(users, user_with_history) + self._deduplicate_users(email, duplicates) + + def _get_users_with_same_email(self, email: str): + sql = text( + """ + SELECT u.id, EXISTS(SELECT h.id FROM history h WHERE h.user_id = u.id) + FROM galaxy_user u + WHERE u.email = :email + ORDER BY u.create_time + """ + ) + params = {"email": email} + return self.connection.execute(sql, params).all() + + def _find_oldest_user_with_history(self, users): + for user_id, exists in users: + if exists: + return user_id + return None + + def _get_users_to_deduplicate(self, users, user_with_history): + if user_with_history: + # Preserve the oldest user with a history + return [user_id for user_id, _ in users if user_id != user_with_history] + else: + # Preserve the oldest user + return [user_id for user_id, _ in users[1:]] + + def _deduplicate_users(self, email, to_deduplicate): + for id in to_deduplicate: + new_email = self._generate_replacement_for_duplicate_email(email) + stmt = update(User).where(User.id == id).values(email=new_email, deleted=True) + self.connection.execute(stmt) + + def _generate_replacement_for_duplicate_email(self, email: str) -> str: + """ + Generate a replacement for a duplicate email value. The new value consists of the original + email and a unique suffix. Since the original email is part of the new value, it will be + possible to retrieve the user record based on this value, if needed. + """ + return f"{email}-{uuid.uuid4()}" diff --git a/lib/galaxy/model/unittest_utils/utils.py b/lib/galaxy/model/unittest_utils/utils.py new file mode 100644 index 000000000000..c558b52b51de --- /dev/null +++ b/lib/galaxy/model/unittest_utils/utils.py @@ -0,0 +1,13 @@ +import random +import string + + +def random_str() -> str: + alphabet = string.ascii_lowercase + string.digits + size = random.randint(5, 10) + return "".join(random.choices(alphabet, k=size)) + + +def random_email() -> str: + text = random_str() + return f"{text}@galaxy.testing" diff --git a/test/integration/test_celery_user_rate_limit.py b/test/integration/test_celery_user_rate_limit.py index b9b832c5cddc..c36dacc880fd 100644 --- a/test/integration/test_celery_user_rate_limit.py +++ b/test/integration/test_celery_user_rate_limit.py @@ -50,7 +50,7 @@ def setup_users(dburl: str, num_users: int = 2): for user_id in user_ids_to_add: conn.execute( text("insert into galaxy_user(id, active, email, password) values (:id, :active, :email, :pw)"), - [{"id": user_id, "active": True, "email": "e", "pw": "p"}], + [{"id": user_id, "active": True, "email": f"e{user_id}", "pw": "p"}], ) diff --git a/test/unit/app/jobs/test_rule_helper.py b/test/unit/app/jobs/test_rule_helper.py index 1d1197a0dc78..f3be1cdd20c0 100644 --- a/test/unit/app/jobs/test_rule_helper.py +++ b/test/unit/app/jobs/test_rule_helper.py @@ -66,7 +66,7 @@ def __setup_fixtures(app): # user3 has no jobs. user1 = model.User(email=USER_EMAIL_1, password="pass1") user2 = model.User(email=USER_EMAIL_2, password="pass2") - user3 = model.User(email=USER_EMAIL_2, password="pass2") + user3 = model.User(email=USER_EMAIL_3, password="pass3") app.add(user1, user2, user3) diff --git a/test/unit/data/model/conftest.py b/test/unit/data/model/conftest.py index 034be09e51b8..38e8d3cf2ab0 100644 --- a/test/unit/data/model/conftest.py +++ b/test/unit/data/model/conftest.py @@ -1,7 +1,5 @@ import contextlib import os -import random -import string import tempfile import uuid @@ -10,6 +8,10 @@ from sqlalchemy.orm import Session from galaxy import model as m +from galaxy.model.unittest_utils.utils import ( + random_email, + random_str, +) @pytest.fixture @@ -289,17 +291,6 @@ def transaction(session): yield -def random_str() -> str: - alphabet = string.ascii_lowercase + string.digits - size = random.randint(5, 10) - return "".join(random.choices(alphabet, k=size)) - - -def random_email() -> str: - text = random_str() - return f"{text}@galaxy.testing" - - def write_to_db(session, model) -> None: with transaction(session): session.add(model) diff --git a/test/unit/data/test_galaxy_mapping.py b/test/unit/data/test_galaxy_mapping.py index 1a88a82f78ed..9acf3db807c8 100644 --- a/test/unit/data/test_galaxy_mapping.py +++ b/test/unit/data/test_galaxy_mapping.py @@ -23,6 +23,7 @@ get_object_session, ) from galaxy.model.security import GalaxyRBACAgent +from galaxy.model.unittest_utils.utils import random_email from galaxy.objectstore import QuotaSourceMap from galaxy.util.unittest import TestCase @@ -262,7 +263,7 @@ def test_collection_get_interface(self): assert c1[i] == dces[i] def test_dataset_instance_order(self) -> None: - u = model.User(email="mary@example.com", password="password") + u = model.User(email=random_email(), password="password") h1 = model.History(name="History 1", user=u) elements = [] list_pair = model.DatasetCollection(collection_type="list:paired") @@ -669,7 +670,7 @@ def contents_iter_names(**kwds): assert contents_iter_names(ids=[d1.id, d3.id]) == ["1", "3"] def test_history_audit(self): - u = model.User(email="contents@foo.bar.baz", password="password") + u = model.User(email=random_email(), password="password") h1 = model.History(name="HistoryAuditHistory", user=u) h2 = model.History(name="HistoryAuditHistory", user=u) @@ -739,7 +740,7 @@ def test_flush_refreshes(self): # states and flushing in SQL Alchemy is very subtle and it is good to have a executable # reference for how it behaves in the context of Galaxy objects. model = self.model - user = model.User(email="testworkflows@bx.psu.edu", password="password") + user = model.User(email=random_email(), password="password") galaxy_session = model.GalaxySession() galaxy_session_other = model.GalaxySession() galaxy_session.user = user @@ -812,7 +813,7 @@ def test_flush_refreshes(self): assert "id" not in inspect(galaxy_model_object_new).unloaded def test_workflows(self): - user = model.User(email="testworkflows@bx.psu.edu", password="password") + user = model.User(email=random_email(), password="password") child_workflow = _workflow_from_steps(user, []) self.persist(child_workflow) diff --git a/test/unit/data/test_quota.py b/test/unit/data/test_quota.py index 5c34ea7cb056..7a3695894445 100644 --- a/test/unit/data/test_quota.py +++ b/test/unit/data/test_quota.py @@ -1,6 +1,7 @@ import uuid from galaxy import model +from galaxy.model.unittest_utils.utils import random_email from galaxy.objectstore import ( QuotaSourceInfo, QuotaSourceMap, @@ -16,7 +17,7 @@ class TestPurgeUsage(BaseModelTestCase): def setUp(self): super().setUp() model = self.model - u = model.User(email="purge_usage@example.com", password="password") + u = model.User(email=random_email(), password="password") u.disk_usage = 25 self.persist(u) diff --git a/test/unit/workflows/test_run_parameters.py b/test/unit/workflows/test_run_parameters.py index 76eae8955744..5718ac923a40 100644 --- a/test/unit/workflows/test_run_parameters.py +++ b/test/unit/workflows/test_run_parameters.py @@ -1,5 +1,6 @@ from galaxy import model from galaxy.model.base import transaction +from galaxy.model.unittest_utils.utils import random_email from galaxy.workflow.run_request import ( _normalize_inputs, _normalize_step_parameters, @@ -89,7 +90,7 @@ def __new_input(): def __workflow_fixure(trans): - user = model.User(email="testworkflow_params@bx.psu.edu", password="pass") + user = model.User(email=random_email(), password="pass") stored_workflow = model.StoredWorkflow() stored_workflow.user = user workflow = model.Workflow()