Skip to content

Commit

Permalink
Username/email migration commits (squashed)
Browse files Browse the repository at this point in the history
Add directory for migrations data fixing scripts

Add username deduplication data fixer

Add migration for username column unique constraint

Add email deduplication data fixer

Add migration for email column unique constraint

Update the model w/unique constraints

Fix integration test that violated db integrity constraint

Randomize test user email to respect unique constraint

Fix bug in test_rule_helper

Fix test_quota to respect constraint

Fix test_galaxy_mapping to respect constraint
  • Loading branch information
jdavcs committed Sep 30, 2024
1 parent b4f069a commit b4ee57c
Show file tree
Hide file tree
Showing 12 changed files with 248 additions and 22 deletions.
2 changes: 1 addition & 1 deletion lib/galaxy/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -776,7 +776,7 @@ class User(Base, Dictifiable, RepresentById):
id: Mapped[int] = mapped_column(primary_key=True)
create_time: Mapped[datetime] = mapped_column(default=now, nullable=True)
update_time: Mapped[datetime] = mapped_column(default=now, onupdate=now, nullable=True)
email: Mapped[str] = mapped_column(TrimmedString(255), index=True)
email: Mapped[str] = mapped_column(TrimmedString(255), index=True, unique=True)
username: Mapped[Optional[str]] = mapped_column(TrimmedString(255), index=True, unique=True)
password: Mapped[str] = mapped_column(TrimmedString(255))
last_password_change: Mapped[Optional[datetime]] = mapped_column(default=now)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""Email column unique constraint
Revision ID: 1cf595475b58
Revises: d619fdfa6168
Create Date: 2024-07-03 19:53:22.443016
"""

from alembic import op

from galaxy.model.database_object_names import build_index_name
from galaxy.model.migrations.data_fixes.user_table_fixer import EmailDeduplicator
from galaxy.model.migrations.util import (
create_index,
drop_index,
index_exists,
transaction,
)

# revision identifiers, used by Alembic.
revision = "1cf595475b58"
down_revision = "d619fdfa6168"
branch_labels = None
depends_on = None


table_name = "galaxy_user"
column_name = "email"
index_name = build_index_name(table_name, [column_name])


def upgrade():
with transaction():
_fix_duplicate_emails()
# Existing databases may have an existing index we no longer need
# New databases will not have that index, so we must check.
if index_exists(index_name, table_name, False):
drop_index(index_name, table_name)
# Create a UNIQUE index
create_index(index_name, table_name, [column_name], unique=True)


def downgrade():
with transaction():
drop_index(index_name, table_name)
# Restore a non-unique index
create_index(index_name, table_name, [column_name])


def _fix_duplicate_emails():
"""Fix records with duplicate usernames"""
connection = op.get_bind()
EmailDeduplicator(connection).run()
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""Username column unique constraint
Revision ID: d619fdfa6168
Revises: d2d8f51ebb7e
Create Date: 2024-07-02 13:13:10.325586
"""

from alembic import op

from galaxy.model.database_object_names import build_index_name
from galaxy.model.migrations.data_fixes.user_table_fixer import UsernameDeduplicator
from galaxy.model.migrations.util import (
create_index,
drop_index,
index_exists,
transaction,
)

# revision identifiers, used by Alembic.
revision = "d619fdfa6168"
down_revision = "d2d8f51ebb7e"
branch_labels = None
depends_on = None

table_name = "galaxy_user"
column_name = "username"
index_name = build_index_name(table_name, [column_name])


def upgrade():
with transaction():
_fix_duplicate_usernames()
# Existing databases may have an existing index we no longer need
# New databases will not have that index, so we must check.
if index_exists(index_name, table_name, False):
drop_index(index_name, table_name)
# Create a UNIQUE index
create_index(index_name, table_name, [column_name], unique=True)


def downgrade():
with transaction():
drop_index(index_name, table_name)
# Restore a non-unique index
create_index(index_name, table_name, [column_name])


def _fix_duplicate_usernames():
"""Fix records with duplicate usernames"""
connection = op.get_bind()
UsernameDeduplicator(connection).run()
4 changes: 4 additions & 0 deletions lib/galaxy/model/migrations/data_fixes/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""
Package contains code for fixing inconsistent data in the database that must be
run together with a migration script.
"""
112 changes: 112 additions & 0 deletions lib/galaxy/model/migrations/data_fixes/user_table_fixer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import uuid

from sqlalchemy import (
func,
Result,
select,
text,
update,
)

from galaxy.model import User


class UsernameDeduplicator:

def __init__(self, connection):
self.connection = connection

def run(self):
"""
Deduplicate usernames by generating a unique value for all duplicates, keeping
the username of the most recently created user unchanged.
Records updated with the generated value are marked as deleted.
"""
duplicates = self._get_duplicate_username_data()
prev_username = None
for id, username, _ in duplicates:
if username == prev_username:
new_username = self._generate_next_available_username(username)
stmt = update(User).where(User.id == id).values(username=new_username, deleted=True)
self.connection.execute(stmt)
else:
prev_username = username

def _get_duplicate_username_data(self) -> Result:
# Duplicate usernames
duplicates_stmt = select(User.username).group_by(User.username).having(func.count() > 1)
# User data for records with duplicate usernames (ordering: newest to oldest)
stmt = (
select(User.id, User.username, User.create_time)
.where(User.username.in_(duplicates_stmt))
.order_by(User.username, User.create_time.desc())
)
return self.connection.execute(stmt)

def _generate_next_available_username(self, username):
i = 1
while self.connection.execute(select(User).where(User.username == f"{username}-{i}")).first():
i += 1
return f"{username}-{i}"


class EmailDeduplicator:

def __init__(self, connection):
self.connection = connection

def run(self):
"""
Deduplicate user emails by generating a unique value for all duplicates, keeping
the email of the most recently created user that has one or more history unchanged.
If such a user does not exist, keep the oldest user.
Records updated with the generated value are marked as deleted (we presume them
to be invalid, and the user should not be able to login).
"""
stmt = select(User.email).group_by(User.email).having(func.count() > 1)
duplicate_emails = self.connection.scalars(stmt)
for email in duplicate_emails:
users = self._get_users_with_same_email(email)
user_with_history = self._find_oldest_user_with_history(users)
duplicates = self._get_users_to_deduplicate(users, user_with_history)
self._deduplicate_users(email, duplicates)

def _get_users_with_same_email(self, email: str):
sql = text(
"""
SELECT u.id, EXISTS(SELECT h.id FROM history h WHERE h.user_id = u.id)
FROM galaxy_user u
WHERE u.email = :email
ORDER BY u.create_time
"""
)
params = {"email": email}
return self.connection.execute(sql, params).all()

def _find_oldest_user_with_history(self, users):
for user_id, exists in users:
if exists:
return user_id
return None

def _get_users_to_deduplicate(self, users, user_with_history):
if user_with_history:
# Preserve the oldest user with a history
return [user_id for user_id, _ in users if user_id != user_with_history]
else:
# Preserve the oldest user
return [user_id for user_id, _ in users[1:]]

def _deduplicate_users(self, email, to_deduplicate):
for id in to_deduplicate:
new_email = self._generate_replacement_for_duplicate_email(email)
stmt = update(User).where(User.id == id).values(email=new_email, deleted=True)
self.connection.execute(stmt)

def _generate_replacement_for_duplicate_email(self, email: str) -> str:
"""
Generate a replacement for a duplicate email value. The new value consists of the original
email and a unique suffix. Since the original email is part of the new value, it will be
possible to retrieve the user record based on this value, if needed.
"""
return f"{email}-{uuid.uuid4()}"
13 changes: 13 additions & 0 deletions lib/galaxy/model/unittest_utils/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import random
import string


def random_str() -> str:
alphabet = string.ascii_lowercase + string.digits
size = random.randint(5, 10)
return "".join(random.choices(alphabet, k=size))


def random_email() -> str:
text = random_str()
return f"{text}@galaxy.testing"
2 changes: 1 addition & 1 deletion test/integration/test_celery_user_rate_limit.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def setup_users(dburl: str, num_users: int = 2):
for user_id in user_ids_to_add:
conn.execute(
text("insert into galaxy_user(id, active, email, password) values (:id, :active, :email, :pw)"),
[{"id": user_id, "active": True, "email": "e", "pw": "p"}],
[{"id": user_id, "active": True, "email": f"e{user_id}", "pw": "p"}],
)


Expand Down
2 changes: 1 addition & 1 deletion test/unit/app/jobs/test_rule_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def __setup_fixtures(app):
# user3 has no jobs.
user1 = model.User(email=USER_EMAIL_1, password="pass1")
user2 = model.User(email=USER_EMAIL_2, password="pass2")
user3 = model.User(email=USER_EMAIL_2, password="pass2")
user3 = model.User(email=USER_EMAIL_3, password="pass3")

app.add(user1, user2, user3)

Expand Down
17 changes: 4 additions & 13 deletions test/unit/data/model/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import contextlib
import os
import random
import string
import tempfile
import uuid

Expand All @@ -10,6 +8,10 @@
from sqlalchemy.orm import Session

from galaxy import model as m
from galaxy.model.unittest_utils.utils import (
random_email,
random_str,
)


@pytest.fixture
Expand Down Expand Up @@ -289,17 +291,6 @@ def transaction(session):
yield


def random_str() -> str:
alphabet = string.ascii_lowercase + string.digits
size = random.randint(5, 10)
return "".join(random.choices(alphabet, k=size))


def random_email() -> str:
text = random_str()
return f"{text}@galaxy.testing"


def write_to_db(session, model) -> None:
with transaction(session):
session.add(model)
Expand Down
9 changes: 5 additions & 4 deletions test/unit/data/test_galaxy_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
get_object_session,
)
from galaxy.model.security import GalaxyRBACAgent
from galaxy.model.unittest_utils.utils import random_email
from galaxy.objectstore import QuotaSourceMap
from galaxy.util.unittest import TestCase

Expand Down Expand Up @@ -262,7 +263,7 @@ def test_collection_get_interface(self):
assert c1[i] == dces[i]

def test_dataset_instance_order(self) -> None:
u = model.User(email="[email protected]", password="password")
u = model.User(email=random_email(), password="password")
h1 = model.History(name="History 1", user=u)
elements = []
list_pair = model.DatasetCollection(collection_type="list:paired")
Expand Down Expand Up @@ -669,7 +670,7 @@ def contents_iter_names(**kwds):
assert contents_iter_names(ids=[d1.id, d3.id]) == ["1", "3"]

def test_history_audit(self):
u = model.User(email="[email protected]", password="password")
u = model.User(email=random_email(), password="password")
h1 = model.History(name="HistoryAuditHistory", user=u)
h2 = model.History(name="HistoryAuditHistory", user=u)

Expand Down Expand Up @@ -739,7 +740,7 @@ def test_flush_refreshes(self):
# states and flushing in SQL Alchemy is very subtle and it is good to have a executable
# reference for how it behaves in the context of Galaxy objects.
model = self.model
user = model.User(email="[email protected]", password="password")
user = model.User(email=random_email(), password="password")
galaxy_session = model.GalaxySession()
galaxy_session_other = model.GalaxySession()
galaxy_session.user = user
Expand Down Expand Up @@ -812,7 +813,7 @@ def test_flush_refreshes(self):
assert "id" not in inspect(galaxy_model_object_new).unloaded

def test_workflows(self):
user = model.User(email="[email protected]", password="password")
user = model.User(email=random_email(), password="password")

child_workflow = _workflow_from_steps(user, [])
self.persist(child_workflow)
Expand Down
3 changes: 2 additions & 1 deletion test/unit/data/test_quota.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import uuid

from galaxy import model
from galaxy.model.unittest_utils.utils import random_email
from galaxy.objectstore import (
QuotaSourceInfo,
QuotaSourceMap,
Expand All @@ -16,7 +17,7 @@ class TestPurgeUsage(BaseModelTestCase):
def setUp(self):
super().setUp()
model = self.model
u = model.User(email="[email protected]", password="password")
u = model.User(email=random_email(), password="password")
u.disk_usage = 25
self.persist(u)

Expand Down
3 changes: 2 additions & 1 deletion test/unit/workflows/test_run_parameters.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from galaxy import model
from galaxy.model.base import transaction
from galaxy.model.unittest_utils.utils import random_email
from galaxy.workflow.run_request import (
_normalize_inputs,
_normalize_step_parameters,
Expand Down Expand Up @@ -89,7 +90,7 @@ def __new_input():


def __workflow_fixure(trans):
user = model.User(email="[email protected]", password="pass")
user = model.User(email=random_email(), password="pass")
stored_workflow = model.StoredWorkflow()
stored_workflow.user = user
workflow = model.Workflow()
Expand Down

0 comments on commit b4ee57c

Please sign in to comment.