From 75e160d9894ec829d7efab3380a492ef4a85c3e1 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 20 Oct 2023 16:29:04 +0000 Subject: [PATCH 01/17] parallel-workload: Less locking --- ci/nightly/pipeline.template.yml | 12 +- .../materialize/parallel_workload/action.py | 276 +++++++++++------- .../materialize/parallel_workload/database.py | 44 +++ .../parallel_workload/parallel_workload.py | 27 +- .../materialize/parallel_workload/settings.py | 1 + 5 files changed, 231 insertions(+), 129 deletions(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index e4fbb28ec216b..ba7813703d038 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -871,7 +871,17 @@ steps: - ./ci/plugins/mzcompose: composition: parallel-workload args: [--runtime=1500, --scenario=kill] - skip: "TODO(def-) Enable after figuring out restoring catalog" + + - id: parallel-workload-backup-restore + label: "Parallel Workload (backup & restore)" + artifact_paths: [junit_*.xml, parallel-workload-queries.log.zst] + timeout_in_minutes: 40 + agents: + queue: builder-linux-x86_64 + plugins: + - ./ci/plugins/mzcompose: + composition: parallel-workload + args: [--runtime=1500, --scenario=backup-restore] - id: incident-70 label: "Test for incident 70" diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 6de7c2c586d30..7b3a045c17b0e 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -231,9 +231,9 @@ def run(self, exe: Executor) -> None: if not sources: return source = self.rng.choice(sources) + with source.lock: transaction = next(source.generator) - with source.lock: - source.executor.run(transaction) + source.executor.run(transaction) class UpdateAction(Action): @@ -341,14 +341,13 @@ def run(self, exe: Executor) -> None: class CreateTableAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.tables) > MAX_TABLES: - return - table_id = self.db.table_id - self.db.table_id += 1 - table = Table(self.rng, table_id, self.rng.choice(self.db.schemas)) - table.create(exe) - self.db.tables.append(table) + if len(self.db.tables) > MAX_TABLES: + return + table_id = self.db.table_id + self.db.table_id += 1 + table = Table(self.rng, table_id, self.rng.choice(self.db.schemas)) + table.create(exe) + self.db.tables.append(table) class DropTableAction(Action): @@ -361,11 +360,10 @@ def run(self, exe: Executor) -> None: with self.db.lock: if len(self.db.tables) <= 2: return - table_id = self.rng.randrange(len(self.db.tables)) - table = self.db.tables[table_id] + table = self.rng.choice(self.db.tables) query = f"DROP TABLE {table}" exe.execute(query) - del self.db.tables[table_id] + self.db.tables.remove(table) class RenameTableAction(Action): @@ -376,17 +374,51 @@ def run(self, exe: Executor) -> None: if not self.db.tables: return table = self.rng.choice(self.db.tables) - old_name = str(table) - table.rename += 1 + old_name = str(table) + table.rename += 1 + try: + exe.execute(f"ALTER TABLE {old_name} RENAME TO {identifier(table.name())}") + except: + table.rename -= 1 + raise + + +class RenameViewAction(Action): + def run(self, exe: Executor) -> None: + if self.db.scenario != Scenario.Rename: + return + with self.db.lock: + if not self.db.views: + return + view = self.rng.choice(self.db.views) + old_name = str(view) + view.rename += 1 try: exe.execute( - f"ALTER TABLE {old_name} RENAME TO 
{identifier(table.name())}" + f"ALTER {'MATERIALIZED VIEW' if view.materialized else 'VIEW'} {old_name} RENAME TO {identifier(view.name())}" ) except: - table.rename -= 1 + view.rename -= 1 raise +class RenameSinkAction(Action): + def run(self, exe: Executor) -> None: + if self.db.scenario != Scenario.Rename: + return + with self.db.lock: + if not self.db.kafka_sinks: + return + sink = self.rng.choice(self.db.kafka_sinks) + old_name = str(sink) + sink.rename += 1 + try: + exe.execute(f"ALTER SINK {old_name} RENAME TO {identifier(sink.name())}") + except: + sink.rename -= 1 + raise + + class CreateSchemaAction(Action): def run(self, exe: Executor) -> None: with self.db.lock: @@ -394,9 +426,9 @@ def run(self, exe: Executor) -> None: return schema_id = self.db.schema_id self.db.schema_id += 1 - schema = Schema(self.rng, schema_id) - schema.create(exe) - self.db.schemas.append(schema) + schema = Schema(self.rng, schema_id) + schema.create(exe) + self.db.schemas.append(schema) class DropSchemaAction(Action): @@ -422,13 +454,13 @@ def run(self, exe: Executor) -> None: return with self.db.lock: schema = self.rng.choice(self.db.schemas) - old_name = str(schema) - schema.rename += 1 - try: - exe.execute(f"ALTER SCHEMA {old_name} RENAME TO {schema}") - except: - schema.rename -= 1 - raise + old_name = str(schema) + schema.rename += 1 + try: + exe.execute(f"ALTER SCHEMA {old_name} RENAME TO {schema}") + except: + schema.rename -= 1 + raise class SwapSchemaAction(Action): @@ -476,21 +508,21 @@ def run(self, exe: Executor) -> None: return view_id = self.db.view_id self.db.view_id += 1 - # Don't use views for now since LIMIT 1 and statement_timeout are - # not effective yet at preventing long-running queries and OoMs. - base_object = self.rng.choice(self.db.db_objects()) - base_object2: DBObject | None = self.rng.choice(self.db.db_objects()) - if self.rng.choice([True, False]) or base_object2 == base_object: - base_object2 = None - view = View( - self.rng, - view_id, - base_object, - base_object2, - self.rng.choice(self.db.schemas), - ) - view.create(exe) - self.db.views.append(view) + # Don't use views for now since LIMIT 1 and statement_timeout are + # not effective yet at preventing long-running queries and OoMs. 
+ base_object = self.rng.choice(self.db.db_objects()) + base_object2: DBObject | None = self.rng.choice(self.db.db_objects()) + if self.rng.choice([True, False]) or base_object2 == base_object: + base_object2 = None + view = View( + self.rng, + view_id, + base_object, + base_object2, + self.rng.choice(self.db.schemas), + ) + view.create(exe) + self.db.views.append(view) class DropViewAction(Action): @@ -520,9 +552,9 @@ def run(self, exe: Executor) -> None: return role_id = self.db.role_id self.db.role_id += 1 - role = Role(role_id) - role.create(exe) - self.db.roles.append(role) + role = Role(role_id) + role.create(exe) + self.db.roles.append(role) class DropRoleAction(Action): @@ -550,15 +582,15 @@ def run(self, exe: Executor) -> None: return cluster_id = self.db.cluster_id self.db.cluster_id += 1 - cluster = Cluster( - cluster_id, - managed=self.rng.choice([True, False]), - size=self.rng.choice(["1", "2", "4"]), - replication_factor=self.rng.choice([1, 2, 4, 5]), - introspection_interval=self.rng.choice(["0", "1s", "10s"]), - ) - cluster.create(exe) - self.db.clusters.append(cluster) + cluster = Cluster( + cluster_id, + managed=self.rng.choice([True, False]), + size=self.rng.choice(["1", "2", "4"]), + replication_factor=self.rng.choice([1, 2, 4, 5]), + introspection_interval=self.rng.choice(["0", "1s", "10s"]), + ) + cluster.create(exe) + self.db.clusters.append(cluster) class DropClusterAction(Action): @@ -591,16 +623,20 @@ def run(self, exe: Executor) -> None: if not self.db.clusters: return cluster = self.rng.choice(self.db.clusters) - query = f"SET CLUSTER = {cluster}" - exe.execute(query) + query = f"SET CLUSTER = {cluster}" + exe.execute(query) class CreateClusterReplicaAction(Action): def errors_to_ignore(self) -> list[str]: - return [ - "cannot create more than one replica of a cluster containing sources or sinks" + result = [ + "cannot create more than one replica of a cluster containing sources or sinks", + # Can happen with reduced locking + "cannot create multiple replicas named", ] + super().errors_to_ignore() + return result + def run(self, exe: Executor) -> None: with self.db.lock: # Keep cluster 0 with 1 replica for sources/sinks @@ -615,9 +651,13 @@ def run(self, exe: Executor) -> None: size=self.rng.choice(["1", "2", "4"]), cluster=cluster, ) + cluster.replica_id += 1 + try: replica.create(exe) cluster.replicas.append(replica) - cluster.replica_id += 1 + except: + cluster.replica_id -= 1 + raise class DropClusterReplicaAction(Action): @@ -631,11 +671,10 @@ def run(self, exe: Executor) -> None: # Avoid "has no replicas available to service request" error if len(cluster.replicas) <= 1: return - replica_id = self.rng.randrange(len(cluster.replicas)) - replica = cluster.replicas[replica_id] + replica = self.rng.choice(cluster.replicas) query = f"DROP CLUSTER REPLICA {cluster}.{replica}" exe.execute(query) - del cluster.replicas[replica_id] + cluster.replicas.remove(replica) class GrantPrivilegesAction(Action): @@ -644,11 +683,11 @@ def run(self, exe: Executor) -> None: if not self.db.roles: return role = self.rng.choice(self.db.roles) - privilege = self.rng.choice(["SELECT", "INSERT", "UPDATE", "ALL"]) - tables_views: list[DBObject] = [*self.db.tables, *self.db.views] - table = self.rng.choice(tables_views) - query = f"GRANT {privilege} ON {table} TO {role}" - exe.execute(query) + privilege = self.rng.choice(["SELECT", "INSERT", "UPDATE", "ALL"]) + tables_views: list[DBObject] = [*self.db.tables, *self.db.views] + table = self.rng.choice(tables_views) + query = f"GRANT 
{privilege} ON {table} TO {role}" + exe.execute(query) class RevokePrivilegesAction(Action): @@ -657,11 +696,11 @@ def run(self, exe: Executor) -> None: if not self.db.roles: return role = self.rng.choice(self.db.roles) - privilege = self.rng.choice(["SELECT", "INSERT", "UPDATE", "ALL"]) - tables_views: list[DBObject] = [*self.db.tables, *self.db.views] - table = self.rng.choice(tables_views) - query = f"REVOKE {privilege} ON {table} FROM {role}" - exe.execute(query) + privilege = self.rng.choice(["SELECT", "INSERT", "UPDATE", "ALL"]) + tables_views: list[DBObject] = [*self.db.tables, *self.db.views] + table = self.rng.choice(tables_views) + query = f"REVOKE {privilege} ON {table} FROM {role}" + exe.execute(query) # TODO: Should factor this out so can easily use it without action @@ -774,7 +813,7 @@ def run(self, exe: Executor) -> None: # Otherwise getting failure on "up" locally time.sleep(1) self.composition.up("materialized", detach=True) - time.sleep(self.rng.uniform(20, 60)) + time.sleep(self.rng.uniform(20, 180)) class CreateWebhookSourceAction(Action): @@ -784,12 +823,12 @@ def run(self, exe: Executor) -> None: return webhook_source_id = self.db.webhook_source_id self.db.webhook_source_id += 1 - potential_clusters = [c for c in self.db.clusters if len(c.replicas) == 1] - cluster = self.rng.choice(potential_clusters) - schema = self.rng.choice(self.db.schemas) - source = WebhookSource(webhook_source_id, cluster, schema, self.rng) - source.create(exe) - self.db.webhook_sources.append(source) + potential_clusters = [c for c in self.db.clusters if len(c.replicas) == 1] + cluster = self.rng.choice(potential_clusters) + schema = self.rng.choice(self.db.schemas) + source = WebhookSource(webhook_source_id, cluster, schema, self.rng) + source.create(exe) + self.db.webhook_sources.append(source) class DropWebhookSourceAction(Action): @@ -816,14 +855,18 @@ def run(self, exe: Executor) -> None: return source_id = self.db.kafka_source_id self.db.kafka_source_id += 1 - potential_clusters = [c for c in self.db.clusters if len(c.replicas) == 1] - cluster = self.rng.choice(potential_clusters) - schema = self.rng.choice(self.db.schemas) + potential_clusters = [c for c in self.db.clusters if len(c.replicas) == 1] + cluster = self.rng.choice(potential_clusters) + schema = self.rng.choice(self.db.schemas) + try: source = KafkaSource( self.db.name(), source_id, cluster, schema, self.db.ports, self.rng ) source.create(exe) self.db.kafka_sources.append(source) + except: + if self.db.scenario != Scenario.Kill: + raise class DropKafkaSourceAction(Action): @@ -853,11 +896,15 @@ def run(self, exe: Executor) -> None: potential_clusters = [c for c in self.db.clusters if len(c.replicas) == 1] schema = self.rng.choice(self.db.schemas) cluster = self.rng.choice(potential_clusters) + try: source = PostgresSource( self.db.name(), source_id, cluster, schema, self.db.ports, self.rng ) source.create(exe) self.db.postgres_sources.append(source) + except: + if self.db.scenario != Scenario.Kill: + raise class DropPostgresSourceAction(Action): @@ -878,6 +925,12 @@ def run(self, exe: Executor) -> None: class CreateKafkaSinkAction(Action): + def errors_to_ignore(self) -> list[str]: + return [ + # Another replica can be created in parallel + "cannot create sink in cluster with more than one replica", + ] + super().errors_to_ignore() + def run(self, exe: Executor) -> None: with self.db.lock: if len(self.db.kafka_sinks) > MAX_KAFKA_SINKS: @@ -887,15 +940,15 @@ def run(self, exe: Executor) -> None: potential_clusters = [c for c 
in self.db.clusters if len(c.replicas) == 1] cluster = self.rng.choice(potential_clusters) schema = self.rng.choice(self.db.schemas) - sink = KafkaSink( - sink_id, - cluster, - schema, - self.rng.choice(self.db.db_objects_without_views()), - self.rng, - ) - sink.create(exe) - self.db.kafka_sinks.append(sink) + sink = KafkaSink( + sink_id, + cluster, + schema, + self.rng.choice(self.db.db_objects_without_views()), + self.rng, + ) + sink.create(exe) + self.db.kafka_sinks.append(sink) class DropKafkaSinkAction(Action): @@ -922,26 +975,31 @@ def run(self, exe: Executor) -> None: return source = self.rng.choice(self.db.webhook_sources) - url = f"http://{self.db.host}:{self.db.ports['http']}/api/webhook/{self.db}/public/{source}" + url = f"http://{self.db.host}:{self.db.ports['http']}/api/webhook/{self.db}/public/{source}" - payload = source.body_format.to_data_type().random_value(self.rng) + payload = source.body_format.to_data_type().random_value(self.rng) - header_fields = source.explicit_include_headers - if source.include_headers: - header_fields.extend( - ["timestamp", "x-event-type", "signature", "x-mz-api-key"] - ) + header_fields = source.explicit_include_headers + if source.include_headers: + header_fields.extend( + ["timestamp", "x-event-type", "signature", "x-mz-api-key"] + ) - headers = { - header: f'"{Text.random_value(self.rng)}"'.encode() - for header in self.rng.sample(header_fields, len(header_fields)) - } + headers = { + header: f'"{Text.random_value(self.rng)}"'.encode() + for header in self.rng.sample(header_fields, len(header_fields)) + } - headers_strs = [f"{key}: {value}" for key, value in enumerate(headers)] - exe.log( - f"POST Headers: {', '.join(headers_strs)} Body: {payload.encode('utf-8')}" - ) + headers_strs = [f"{key}: {value}" for key, value in enumerate(headers)] + exe.log( + f"POST Headers: {', '.join(headers_strs)} Body: {payload.encode('utf-8')}" + ) + try: requests.post(url, data=payload.encode("utf-8"), headers=headers) + except requests.exceptions.ConnectionError: + # Expeceted when Mz is killed + if self.db.scenario != Scenario.Kill: + raise class ActionList: @@ -1012,8 +1070,8 @@ def __init__( (DropRoleAction, 1), (CreateClusterAction, 2), (DropClusterAction, 1), - (CreateClusterReplicaAction, 8), - (DropClusterReplicaAction, 4), + (CreateClusterReplicaAction, 4), + (DropClusterReplicaAction, 2), (SetClusterAction, 1), (CreateWebhookSourceAction, 2), (DropWebhookSourceAction, 1), @@ -1030,6 +1088,8 @@ def __init__( (DropSchemaAction, 1), (RenameSchemaAction, 10), (RenameTableAction, 10), + (RenameViewAction, 10), + (RenameSinkAction, 10), (SwapSchemaAction, 10), # (TransactionIsolationAction, 1), ], diff --git a/misc/python/materialize/parallel_workload/database.py b/misc/python/materialize/parallel_workload/database.py index 35aa59f3f33dd..e3e5fb8853548 100644 --- a/misc/python/materialize/parallel_workload/database.py +++ b/misc/python/materialize/parallel_workload/database.py @@ -210,6 +210,7 @@ class View(DBObject): join_column: Column | None join_column2: Column | None assert_not_null: list[Column] + rename: int schema: Schema def __init__( @@ -220,6 +221,7 @@ def __init__( base_object2: DBObject | None, schema: Schema, ): + self.rename = 0 self.view_id = view_id self.base_object = base_object self.base_object2 = base_object2 @@ -262,6 +264,8 @@ def __init__( self.join_column2 = rng.choice(columns) def name(self) -> str: + if self.rename: + return naughtify(f"v-{self.view_id}-{self.rename}") return naughtify(f"v-{self.view_id}") def __str__(self) -> 
str: @@ -578,13 +582,17 @@ class ClusterReplica: replica_id: int size: str cluster: "Cluster" + rename: int def __init__(self, replica_id: int, size: str, cluster: "Cluster"): self.replica_id = replica_id self.size = size self.cluster = cluster + self.rename = 0 def name(self) -> str: + if self.rename: + return naughtify(f"r-{self.replica_id+1}-{self.rename}") return naughtify(f"r-{self.replica_id+1}") def __str__(self) -> str: @@ -604,6 +612,7 @@ class Cluster: replicas: list[ClusterReplica] replica_id: int introspection_interval: str + rename: int def __init__( self, @@ -621,8 +630,11 @@ def __init__( ] self.replica_id = len(self.replicas) self.introspection_interval = introspection_interval + self.rename = 0 def name(self) -> str: + if self.rename: + return naughtify(f"cluster-{self.cluster_id}-{self.rename}") return naughtify(f"cluster-{self.cluster_id}") def __str__(self) -> str: @@ -798,7 +810,39 @@ def drop(self, exe: Executor) -> None: def create(self, exe: Executor) -> None: self.drop(exe) + exe.execute("ALTER SYSTEM SET enable_webhook_sources TO true") + exe.execute("ALTER SYSTEM SET max_schemas_per_database = 105") + # The presence of ALTER TABLE RENAME can cause the total number of tables to exceed MAX_TABLES + exe.execute("ALTER SYSTEM SET max_tables = 200") + exe.execute("ALTER SYSTEM SET max_materialized_views = 105") + exe.execute("ALTER SYSTEM SET max_sources = 105") + exe.execute("ALTER SYSTEM SET max_roles = 105") + exe.execute("ALTER SYSTEM SET max_clusters = 105") + exe.execute("ALTER SYSTEM SET max_replicas_per_cluster = 105") + # Most queries should not fail because of privileges + exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON TABLES TO PUBLIC" + ) + exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON TYPES TO PUBLIC" + ) + exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON SECRETS TO PUBLIC" + ) + exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON CONNECTIONS TO PUBLIC" + ) + exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON DATABASES TO PUBLIC" + ) + exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON SCHEMAS TO PUBLIC" + ) + exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON CLUSTERS TO PUBLIC" + ) exe.execute(f"CREATE DATABASE {self}") + exe.execute(f"ALTER DATABASE {self} OWNER TO materialize") def create_relations(self, exe: Executor) -> None: exe.execute("SELECT name FROM mz_clusters WHERE name LIKE 'c%'") diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index b37033df48088..0895e107aaa23 100644 --- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -58,22 +58,6 @@ def run( ) initialize_logging() - system_conn = pg8000.connect( - host=host, port=ports["mz_system"], user="mz_system", database="materialize" - ) - system_conn.autocommit = True - with system_conn.cursor() as cur: - cur.execute("ALTER SYSTEM SET enable_webhook_sources TO true") - cur.execute("ALTER SYSTEM SET max_schemas_per_database = 105") - # The presence of ALTER TABLE RENAME can cause the total number of tables to exceed MAX_TABLES - cur.execute("ALTER SYSTEM SET max_tables = 200") - cur.execute("ALTER SYSTEM SET max_materialized_views = 105") - cur.execute("ALTER SYSTEM SET max_sources = 105") - cur.execute("ALTER SYSTEM 
SET max_roles = 105") - cur.execute("ALTER SYSTEM SET max_clusters = 105") - cur.execute("ALTER SYSTEM SET max_replicas_per_cluster = 105") - system_conn.close() - end_time = ( datetime.datetime.now() + datetime.timedelta(seconds=runtime) ).timestamp() @@ -82,11 +66,14 @@ def run( database = Database( rng, seed, host, ports, complexity, scenario, naughty_identifiers ) - conn = pg8000.connect(host=host, port=ports["materialized"], user="materialize") - conn.autocommit = True - with conn.cursor() as cur: + + system_conn = pg8000.connect( + host=host, port=ports["mz_system"], user="mz_system", database="materialize" + ) + system_conn.autocommit = True + with system_conn.cursor() as cur: database.create(Executor(rng, cur)) - conn.close() + system_conn.close() conn = pg8000.connect( host=host, diff --git a/misc/python/materialize/parallel_workload/settings.py b/misc/python/materialize/parallel_workload/settings.py index 67411f6baabea..7e61888ad499f 100644 --- a/misc/python/materialize/parallel_workload/settings.py +++ b/misc/python/materialize/parallel_workload/settings.py @@ -21,3 +21,4 @@ class Scenario(Enum): Cancel = "cancel" Kill = "kill" Rename = "rename" + BackupRestore = "backup-restore" From 73be019e76279f5018600b293137004658356cfe Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Mon, 23 Oct 2023 07:44:23 +0000 Subject: [PATCH 02/17] parallel-workload: Handle drops and other errors in kill scenario --- .../materialize/parallel_workload/action.py | 88 +++++++++++++++---- 1 file changed, 71 insertions(+), 17 deletions(-) diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 7b3a045c17b0e..3fe6e57386fa4 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -49,11 +49,6 @@ if TYPE_CHECKING: from materialize.parallel_workload.worker import Worker -# TODO: In kill scenario drops can be successful, but we might never know, see -# https://github.com/MaterializeInc/materialize/issues/20465 We should handle -# this by rescanning objects we expect to be there and removing the ones that -# were dropped. This also has the risk that objects get lost as a bug though. 
- # TODO: CASCADE in DROPs, keep track of what will be deleted class Action: rng: random.Random @@ -335,7 +330,12 @@ def run(self, exe: Executor) -> None: return index_name = self.rng.choice(list(self.db.indexes)) query = f"DROP INDEX {identifier(index_name)}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + raise e self.db.indexes.remove(index_name) @@ -362,7 +362,12 @@ def run(self, exe: Executor) -> None: return table = self.rng.choice(self.db.tables) query = f"DROP TABLE {table}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + raise e self.db.tables.remove(table) @@ -444,11 +449,21 @@ def run(self, exe: Executor) -> None: schema_id = self.rng.randrange(len(self.db.schemas)) schema = self.db.schemas[schema_id] query = f"DROP SCHEMA {schema}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown schema" not in e.msg: + raise e del self.db.schemas[schema_id] class RenameSchemaAction(Action): + def errors_to_ignore(self) -> list[str]: + return [ + "ambiguous reference to schema named" # see https://github.com/MaterializeInc/materialize/pull/22551#pullrequestreview-1691876923 + ] + super().errors_to_ignore() + def run(self, exe: Executor) -> None: if self.db.scenario != Scenario.Rename: return @@ -541,7 +556,12 @@ def run(self, exe: Executor) -> None: query = f"DROP MATERIALIZED VIEW {view}" else: query = f"DROP VIEW {view}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + raise e del self.db.views[view_id] @@ -571,7 +591,12 @@ def run(self, exe: Executor) -> None: role_id = self.rng.randrange(len(self.db.roles)) role = self.db.roles[role_id] query = f"DROP ROLE {role}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown role" not in e.msg: + raise e del self.db.roles[role_id] @@ -608,7 +633,12 @@ def run(self, exe: Executor) -> None: cluster_id = self.rng.randrange(1, len(self.db.clusters)) cluster = self.db.clusters[cluster_id] query = f"DROP CLUSTER {cluster}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown cluster" not in e.msg: + raise e del self.db.clusters[cluster_id] @@ -673,10 +703,14 @@ def run(self, exe: Executor) -> None: return replica = self.rng.choice(cluster.replicas) query = f"DROP CLUSTER REPLICA {cluster}.{replica}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "has no CLUSTER REPLICA named" not in e.msg: + raise e cluster.replicas.remove(replica) - class GrantPrivilegesAction(Action): def run(self, exe: Executor) -> None: with self.db.lock: @@ -844,7 +878,12 @@ def run(self, exe: Executor) -> None: source_id = self.rng.randrange(len(self.db.webhook_sources)) source = self.db.webhook_sources[source_id] query = f"DROP SOURCE {source}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown catalog item" 
not in e.msg: + raise e del self.db.webhook_sources[source_id] @@ -882,7 +921,12 @@ def run(self, exe: Executor) -> None: source_id = self.rng.randrange(len(self.db.kafka_sources)) source = self.db.kafka_sources[source_id] query = f"DROP SOURCE {source}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + raise e del self.db.kafka_sources[source_id] @@ -920,7 +964,12 @@ def run(self, exe: Executor) -> None: source_id = self.rng.randrange(len(self.db.postgres_sources)) source = self.db.postgres_sources[source_id] query = f"DROP SOURCE {source.executor.source}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + raise e del self.db.postgres_sources[source_id] @@ -964,7 +1013,12 @@ def run(self, exe: Executor) -> None: sink_id = self.rng.randrange(len(self.db.kafka_sinks)) sink = self.db.kafka_sinks[sink_id] query = f"DROP SINK {sink}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + raise e del self.db.kafka_sinks[sink_id] From c7cc60a9eb0bf727967ccdc46df697bc260ca08f Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Mon, 23 Oct 2023 11:34:19 +0000 Subject: [PATCH 03/17] parallel-workload: Run with multiple databases --- .../materialize/parallel_workload/action.py | 555 ++++++++++-------- .../materialize/parallel_workload/database.py | 67 +-- .../materialize/parallel_workload/executor.py | 15 +- .../parallel_workload/parallel_workload.py | 125 +++- .../materialize/parallel_workload/worker.py | 60 +- test/parallel-workload/mzcompose.py | 1 + 6 files changed, 460 insertions(+), 363 deletions(-) diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 3fe6e57386fa4..e49161133eddc 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -19,6 +19,7 @@ from materialize.data_ingest.data_type import NUMBER_TYPES, Text, TextTextMap from materialize.mzcompose.composition import Composition from materialize.parallel_workload.database import ( + DB_OFFSET, MAX_CLUSTER_REPLICAS, MAX_CLUSTERS, MAX_KAFKA_SINKS, @@ -32,7 +33,6 @@ MAX_WEBHOOK_SOURCES, Cluster, ClusterReplica, - Database, DBObject, KafkaSink, KafkaSource, @@ -52,21 +52,19 @@ # TODO: CASCADE in DROPs, keep track of what will be deleted class Action: rng: random.Random - db: Database - def __init__(self, rng: random.Random, db: Database): + def __init__(self, rng: random.Random): self.rng = rng - self.db = db def run(self, exe: Executor) -> None: raise NotImplementedError - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: result = [ "permission denied for", "must be owner of", ] - if self.db.complexity == Complexity.DDL: + if exe.db.complexity == Complexity.DDL: result.extend( [ "query could not complete", @@ -79,13 +77,13 @@ def errors_to_ignore(self) -> list[str]: "the transaction's active cluster has been dropped", # cluster was dropped ] ) - if self.db.scenario == Scenario.Cancel: + if exe.db.scenario == Scenario.Cancel: result.extend( [ "canceling statement due to user request", ] ) - if self.db.scenario == Scenario.Kill: + if exe.db.scenario == Scenario.Kill: 
result.extend( [ "network error", @@ -101,9 +99,9 @@ def errors_to_ignore(self) -> list[str]: class FetchAction(Action): - def errors_to_ignore(self) -> list[str]: - result = super().errors_to_ignore() - if self.db.complexity == Complexity.DDL: + def errors_to_ignore(self, exe: Executor) -> list[str]: + result = super().errors_to_ignore(exe) + if exe.db.complexity == Complexity.DDL: result.extend( [ "does not exist", @@ -112,7 +110,7 @@ def errors_to_ignore(self) -> list[str]: return result def run(self, exe: Executor) -> None: - obj = self.rng.choice(self.db.db_objects()) + obj = self.rng.choice(exe.db.db_objects()) # See https://github.com/MaterializeInc/materialize/issues/20474 exe.rollback() if self.rng.choice([True, False]) else exe.commit() query = f"DECLARE c CURSOR FOR SUBSCRIBE {obj}" @@ -129,15 +127,15 @@ def run(self, exe: Executor) -> None: class SelectAction(Action): - def errors_to_ignore(self) -> list[str]: - result = super().errors_to_ignore() - if self.db.complexity in (Complexity.DML, Complexity.DDL): + def errors_to_ignore(self, exe: Executor) -> list[str]: + result = super().errors_to_ignore(exe) + if exe.db.complexity in (Complexity.DML, Complexity.DDL): result.extend( [ "in the same timedomain", ] ) - if self.db.complexity == Complexity.DDL: + if exe.db.complexity == Complexity.DDL: result.extend( [ "does not exist", @@ -146,9 +144,9 @@ def errors_to_ignore(self) -> list[str]: return result def run(self, exe: Executor) -> None: - obj = self.rng.choice(self.db.db_objects()) + obj = self.rng.choice(exe.db.db_objects()) column = self.rng.choice(obj.columns) - obj2 = self.rng.choice(self.db.db_objects()) + obj2 = self.rng.choice(exe.db.db_objects()) obj_name = str(obj) obj2_name = str(obj2) columns = [c for c in obj2.columns if c.data_type == column.data_type] @@ -197,14 +195,14 @@ class InsertAction(Action): def run(self, exe: Executor) -> None: table = None if exe.insert_table != None: - for t in self.db.tables: + for t in exe.db.tables: if t.table_id == exe.insert_table: table = t break else: exe.commit() if self.rng.choice([True, False]) else exe.rollback() if not table: - table = self.rng.choice(self.db.tables) + table = self.rng.choice(exe.db.tables) column_names = ", ".join(column.name(True) for column in table.columns) column_values = ", ".join( @@ -215,14 +213,14 @@ def run(self, exe: Executor) -> None: return exe.execute(query) exe.insert_table = table.table_id - with self.db.lock: + with exe.db.lock: table.num_rows += 1 class SourceInsertAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - sources = self.db.kafka_sources + self.db.postgres_sources + with exe.db.lock: + sources = exe.db.kafka_sources + exe.db.postgres_sources if not sources: return source = self.rng.choice(sources) @@ -232,20 +230,20 @@ def run(self, exe: Executor) -> None: class UpdateAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "canceling statement due to statement timeout", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: table = None if exe.insert_table != None: - for t in self.db.tables: + for t in exe.db.tables: if t.table_id == exe.insert_table: table = t break if not table: - table = self.rng.choice(self.db.tables) + table = self.rng.choice(exe.db.tables) column1 = table.columns[0] column2 = self.rng.choice(table.columns) @@ -259,13 +257,13 @@ def run(self, exe: Executor) -> None: class DeleteAction(Action): - def 
errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "canceling statement due to statement timeout", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - table = self.rng.choice(self.db.tables) + table = self.rng.choice(exe.db.tables) query = f"DELETE FROM {table}" if self.rng.random() < 0.95: column = self.rng.choice(table.columns) @@ -282,13 +280,13 @@ def run(self, exe: Executor) -> None: # so for now have to trigger them manually here. if self.rng.choice([True, False]): exe.commit() - with self.db.lock: + with exe.db.lock: table.num_rows = 0 class CommentAction(Action): def run(self, exe: Executor) -> None: - table = self.rng.choice(self.db.tables) + table = self.rng.choice(exe.db.tables) if self.rng.choice([True, False]): column = self.rng.choice(table.columns) @@ -300,13 +298,13 @@ def run(self, exe: Executor) -> None: class CreateIndexAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "already exists", # TODO: Investigate - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - tables_views: list[DBObject] = [*self.db.tables, *self.db.views] + tables_views: list[DBObject] = [*exe.db.tables, *exe.db.views] table = self.rng.choice(tables_views) columns = self.rng.sample(table.columns, len(table.columns)) columns_str = "_".join(column.name() for column in columns) @@ -319,83 +317,91 @@ def run(self, exe: Executor) -> None: index_str = ", ".join(index_elems) query = f"CREATE INDEX {identifier(index_name)} ON {table} ({index_str})" exe.execute(query) - with self.db.lock: - self.db.indexes.add(index_name) + with exe.db.lock: + exe.db.indexes.add(index_name) class DropIndexAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.indexes: + with exe.db.lock: + if not exe.db.indexes: return - index_name = self.rng.choice(list(self.db.indexes)) + index_name = self.rng.choice(list(exe.db.indexes)) query = f"DROP INDEX {identifier(index_name)}" try: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + if ( + exe.db.scenario != Scenario.Kill + or "unknown catalog item" not in e.msg + ): raise e - self.db.indexes.remove(index_name) + exe.db.indexes.remove(index_name) class CreateTableAction(Action): def run(self, exe: Executor) -> None: - if len(self.db.tables) > MAX_TABLES: + if len(exe.db.tables) > MAX_TABLES: return - table_id = self.db.table_id - self.db.table_id += 1 - table = Table(self.rng, table_id, self.rng.choice(self.db.schemas)) + table_id = exe.db.table_id + exe.db.table_id += 1 + table = Table(self.rng, table_id, self.rng.choice(exe.db.schemas)) table.create(exe) - self.db.tables.append(table) + exe.db.tables.append(table) class DropTableAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "still depended upon by", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.tables) <= 2: + with exe.db.lock: + if len(exe.db.tables) <= 2: return - table = self.rng.choice(self.db.tables) + table = self.rng.choice(exe.db.tables) query = f"DROP TABLE {table}" try: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or 
"unknown catalog item" not in e.msg: + if ( + exe.db.scenario != Scenario.Kill + or "unknown catalog item" not in e.msg + ): raise e - self.db.tables.remove(table) + exe.db.tables.remove(table) class RenameTableAction(Action): def run(self, exe: Executor) -> None: - if self.db.scenario != Scenario.Rename: + if exe.db.scenario != Scenario.Rename: return - with self.db.lock: - if not self.db.tables: + with exe.db.lock: + if not exe.db.tables: return - table = self.rng.choice(self.db.tables) - old_name = str(table) - table.rename += 1 - try: - exe.execute(f"ALTER TABLE {old_name} RENAME TO {identifier(table.name())}") - except: - table.rename -= 1 - raise + table = self.rng.choice(exe.db.tables) + old_name = str(table) + table.rename += 1 + try: + exe.execute( + f"ALTER TABLE {old_name} RENAME TO {identifier(table.name())}" + ) + except: + table.rename -= 1 + raise class RenameViewAction(Action): def run(self, exe: Executor) -> None: - if self.db.scenario != Scenario.Rename: + if exe.db.scenario != Scenario.Rename: return - with self.db.lock: - if not self.db.views: + with exe.db.lock: + if not exe.db.views: return - view = self.rng.choice(self.db.views) + view = self.rng.choice(exe.db.views) old_name = str(view) view.rename += 1 try: @@ -409,73 +415,75 @@ def run(self, exe: Executor) -> None: class RenameSinkAction(Action): def run(self, exe: Executor) -> None: - if self.db.scenario != Scenario.Rename: + if exe.db.scenario != Scenario.Rename: return - with self.db.lock: - if not self.db.kafka_sinks: + with exe.db.lock: + if not exe.db.kafka_sinks: return - sink = self.rng.choice(self.db.kafka_sinks) - old_name = str(sink) - sink.rename += 1 - try: - exe.execute(f"ALTER SINK {old_name} RENAME TO {identifier(sink.name())}") - except: - sink.rename -= 1 - raise + sink = self.rng.choice(exe.db.kafka_sinks) + old_name = str(sink) + sink.rename += 1 + try: + exe.execute( + f"ALTER SINK {old_name} RENAME TO {identifier(sink.name())}" + ) + except: + sink.rename -= 1 + raise class CreateSchemaAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.schemas) > MAX_SCHEMAS: + with exe.db.lock: + if len(exe.db.schemas) > MAX_SCHEMAS: return - schema_id = self.db.schema_id - self.db.schema_id += 1 + schema_id = exe.db.schema_id + exe.db.schema_id += 1 schema = Schema(self.rng, schema_id) schema.create(exe) - self.db.schemas.append(schema) + exe.db.schemas.append(schema) class DropSchemaAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "cannot be dropped without CASCADE while it contains objects", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.schemas) <= 1: + with exe.db.lock: + if len(exe.db.schemas) <= 1: return - schema_id = self.rng.randrange(len(self.db.schemas)) - schema = self.db.schemas[schema_id] + schema_id = self.rng.randrange(len(exe.db.schemas)) + schema = exe.db.schemas[schema_id] query = f"DROP SCHEMA {schema}" try: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "unknown schema" not in e.msg: + if exe.db.scenario != Scenario.Kill or "unknown schema" not in e.msg: raise e - del self.db.schemas[schema_id] + del exe.db.schemas[schema_id] class RenameSchemaAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "ambiguous reference to schema named" # see 
https://github.com/MaterializeInc/materialize/pull/22551#pullrequestreview-1691876923 - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - if self.db.scenario != Scenario.Rename: + if exe.db.scenario != Scenario.Rename: return - with self.db.lock: - schema = self.rng.choice(self.db.schemas) - old_name = str(schema) - schema.rename += 1 - try: - exe.execute(f"ALTER SCHEMA {old_name} RENAME TO {schema}") - except: - schema.rename -= 1 - raise + with exe.db.lock: + schema = self.rng.choice(exe.db.schemas) + old_name = str(schema) + schema.rename += 1 + try: + exe.execute(f"ALTER SCHEMA {old_name} RENAME TO {schema}") + except: + schema.rename -= 1 + raise class SwapSchemaAction(Action): @@ -518,15 +526,15 @@ def run(self, exe: Executor) -> None: class CreateViewAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.views) > MAX_VIEWS: + with exe.db.lock: + if len(exe.db.views) > MAX_VIEWS: return - view_id = self.db.view_id - self.db.view_id += 1 + view_id = exe.db.view_id + exe.db.view_id += 1 # Don't use views for now since LIMIT 1 and statement_timeout are # not effective yet at preventing long-running queries and OoMs. - base_object = self.rng.choice(self.db.db_objects()) - base_object2: DBObject | None = self.rng.choice(self.db.db_objects()) + base_object = self.rng.choice(exe.db.db_objects()) + base_object2: DBObject | None = self.rng.choice(exe.db.db_objects()) if self.rng.choice([True, False]) or base_object2 == base_object: base_object2 = None view = View( @@ -534,24 +542,24 @@ def run(self, exe: Executor) -> None: view_id, base_object, base_object2, - self.rng.choice(self.db.schemas), + self.rng.choice(exe.db.schemas), ) view.create(exe) - self.db.views.append(view) + exe.db.views.append(view) class DropViewAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "still depended upon by", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.views: + with exe.db.lock: + if not exe.db.views: return - view_id = self.rng.randrange(len(self.db.views)) - view = self.db.views[view_id] + view_id = self.rng.randrange(len(exe.db.views)) + view = exe.db.views[view_id] if view.materialized: query = f"DROP MATERIALIZED VIEW {view}" else: @@ -560,117 +568,119 @@ def run(self, exe: Executor) -> None: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + if ( + exe.db.scenario != Scenario.Kill + or "unknown catalog item" not in e.msg + ): raise e - del self.db.views[view_id] + del exe.db.views[view_id] class CreateRoleAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.roles) > MAX_ROLES: + with exe.db.lock: + if len(exe.db.roles) > MAX_ROLES: return - role_id = self.db.role_id - self.db.role_id += 1 - role = Role(role_id) + role_id = exe.db.role_id + exe.db.role_id += 1 + role = Role(exe.db.db_id * DB_OFFSET + role_id) role.create(exe) - self.db.roles.append(role) + exe.db.roles.append(role) class DropRoleAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "cannot be dropped because some objects depend on it", "current role cannot be dropped", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> 
None: - with self.db.lock: - if not self.db.roles: + with exe.db.lock: + if not exe.db.roles: return - role_id = self.rng.randrange(len(self.db.roles)) - role = self.db.roles[role_id] + role = self.rng.choice(exe.db.roles) query = f"DROP ROLE {role}" try: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "unknown role" not in e.msg: + if exe.db.scenario != Scenario.Kill or "unknown role" not in e.msg: raise e - del self.db.roles[role_id] + exe.db.roles.remove(role) class CreateClusterAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.clusters) > MAX_CLUSTERS: + with exe.db.lock: + if len(exe.db.clusters) > MAX_CLUSTERS: return - cluster_id = self.db.cluster_id - self.db.cluster_id += 1 + cluster_id = exe.db.cluster_id + exe.db.cluster_id += 1 cluster = Cluster( - cluster_id, + exe.db.db_id * DB_OFFSET + cluster_id, managed=self.rng.choice([True, False]), size=self.rng.choice(["1", "2", "4"]), replication_factor=self.rng.choice([1, 2, 4, 5]), introspection_interval=self.rng.choice(["0", "1s", "10s"]), ) cluster.create(exe) - self.db.clusters.append(cluster) + exe.db.clusters.append(cluster) class DropClusterAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ # cannot drop cluster "..." because other objects depend on it "because other objects depend on it", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.clusters) <= 1: + with exe.db.lock: + if len(exe.db.clusters) <= 1: return # Keep cluster 0 with 1 replica for sources/sinks - cluster_id = self.rng.randrange(1, len(self.db.clusters)) - cluster = self.db.clusters[cluster_id] + cluster_id = self.rng.randrange(1, len(exe.db.clusters)) + cluster = exe.db.clusters[cluster_id] query = f"DROP CLUSTER {cluster}" try: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "unknown cluster" not in e.msg: + if exe.db.scenario != Scenario.Kill or "unknown cluster" not in e.msg: raise e - del self.db.clusters[cluster_id] + del exe.db.clusters[cluster_id] class SetClusterAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "SET cluster cannot be called in an active transaction", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.clusters: + with exe.db.lock: + if not exe.db.clusters: return - cluster = self.rng.choice(self.db.clusters) + cluster = self.rng.choice(exe.db.clusters) query = f"SET CLUSTER = {cluster}" exe.execute(query) class CreateClusterReplicaAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: result = [ "cannot create more than one replica of a cluster containing sources or sinks", # Can happen with reduced locking "cannot create multiple replicas named", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) return result def run(self, exe: Executor) -> None: - with self.db.lock: + with exe.db.lock: # Keep cluster 0 with 1 replica for sources/sinks - unmanaged_clusters = [c for c in self.db.clusters[1:] if not c.managed] + unmanaged_clusters = [c for c in exe.db.clusters[1:] if not c.managed] if not unmanaged_clusters: return cluster = self.rng.choice(unmanaged_clusters) @@ 
-692,9 +702,9 @@ def run(self, exe: Executor) -> None: class DropClusterReplicaAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: + with exe.db.lock: # Keep cluster 0 with 1 replica for sources/sinks - unmanaged_clusters = [c for c in self.db.clusters[1:] if not c.managed] + unmanaged_clusters = [c for c in exe.db.clusters[1:] if not c.managed] if not unmanaged_clusters: return cluster = self.rng.choice(unmanaged_clusters) @@ -707,18 +717,22 @@ def run(self, exe: Executor) -> None: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "has no CLUSTER REPLICA named" not in e.msg: + if ( + exe.db.scenario != Scenario.Kill + or "has no CLUSTER REPLICA named" not in e.msg + ): raise e cluster.replicas.remove(replica) + class GrantPrivilegesAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.roles: + with exe.db.lock: + if not exe.db.roles: return - role = self.rng.choice(self.db.roles) + role = self.rng.choice(exe.db.roles) privilege = self.rng.choice(["SELECT", "INSERT", "UPDATE", "ALL"]) - tables_views: list[DBObject] = [*self.db.tables, *self.db.views] + tables_views: list[DBObject] = [*exe.db.tables, *exe.db.views] table = self.rng.choice(tables_views) query = f"GRANT {privilege} ON {table} TO {role}" exe.execute(query) @@ -726,12 +740,12 @@ def run(self, exe: Executor) -> None: class RevokePrivilegesAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.roles: + with exe.db.lock: + if not exe.db.roles: return - role = self.rng.choice(self.db.roles) + role = self.rng.choice(exe.db.roles) privilege = self.rng.choice(["SELECT", "INSERT", "UPDATE", "ALL"]) - tables_views: list[DBObject] = [*self.db.tables, *self.db.views] + tables_views: list[DBObject] = [*exe.db.tables, *exe.db.views] table = self.rng.choice(tables_views) query = f"REVOKE {privilege} ON {table} FROM {role}" exe.execute(query) @@ -742,20 +756,19 @@ class ReconnectAction(Action): def __init__( self, rng: random.Random, - db: Database, random_role: bool = True, ): - super().__init__(rng, db) + super().__init__(rng) self.random_role = random_role def run(self, exe: Executor) -> None: autocommit = exe.cur._c.autocommit - host = self.db.host - port = self.db.ports["materialized"] - with self.db.lock: - if self.random_role and self.db.roles: + host = exe.db.host + port = exe.db.ports["materialized"] + with exe.db.lock: + if self.random_role and exe.db.roles: user = self.rng.choice( - ["materialize", str(self.rng.choice(self.db.roles))] + ["materialize", str(self.rng.choice(exe.db.roles))] ) else: user = "materialize" @@ -774,7 +787,7 @@ def run(self, exe: Executor) -> None: for i in range(NUM_ATTEMPTS): try: conn = pg8000.connect( - host=host, port=port, user=user, database=self.db.name() + host=host, port=port, user=user, database=exe.db.name() ) conn.autocommit = autocommit cur = conn.cursor() @@ -798,30 +811,29 @@ def run(self, exe: Executor) -> None: class CancelAction(Action): workers: list["Worker"] - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "must be a member of", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def __init__( self, rng: random.Random, - db: Database, workers: list["Worker"], ): - super().__init__(rng, db) + super().__init__(rng) self.workers = workers def run(self, exe: Executor) -> None: pid = self.rng.choice( - [worker.exe.pg_pid for worker in self.workers if worker.exe 
and worker.exe.pg_pid != -1] # type: ignore + [exe.pg_pid for worker in self.workers for exe in worker.exes if exe and exe.pg_pid != -1] # type: ignore ) worker = None for i in range(len(self.workers)): - worker_exe = self.workers[i].exe - if worker_exe and worker_exe.pg_pid == pid: - worker = f"worker_{i}" - break + for worker_exe in self.workers[i].exes: + if worker_exe and worker_exe.pg_pid == pid: + worker = f"worker_{i}" + break assert worker exe.execute( f"SELECT pg_cancel_backend({pid})", extra_info=f"Canceling {worker}" @@ -836,10 +848,9 @@ class KillAction(Action): def __init__( self, rng: random.Random, - db: Database, composition: Composition, ): - super().__init__(rng, db) + super().__init__(rng) self.composition = composition def run(self, exe: Executor) -> None: @@ -852,184 +863,206 @@ def run(self, exe: Executor) -> None: class CreateWebhookSourceAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.webhook_sources) > MAX_WEBHOOK_SOURCES: + with exe.db.lock: + if len(exe.db.webhook_sources) > MAX_WEBHOOK_SOURCES: return - webhook_source_id = self.db.webhook_source_id - self.db.webhook_source_id += 1 - potential_clusters = [c for c in self.db.clusters if len(c.replicas) == 1] + webhook_source_id = exe.db.webhook_source_id + exe.db.webhook_source_id += 1 + potential_clusters = [c for c in exe.db.clusters if len(c.replicas) == 1] cluster = self.rng.choice(potential_clusters) - schema = self.rng.choice(self.db.schemas) + schema = self.rng.choice(exe.db.schemas) source = WebhookSource(webhook_source_id, cluster, schema, self.rng) source.create(exe) - self.db.webhook_sources.append(source) + exe.db.webhook_sources.append(source) class DropWebhookSourceAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "still depended upon by", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.webhook_sources: + with exe.db.lock: + if not exe.db.webhook_sources: return - source_id = self.rng.randrange(len(self.db.webhook_sources)) - source = self.db.webhook_sources[source_id] + source_id = self.rng.randrange(len(exe.db.webhook_sources)) + source = exe.db.webhook_sources[source_id] query = f"DROP SOURCE {source}" try: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + if ( + exe.db.scenario != Scenario.Kill + or "unknown catalog item" not in e.msg + ): raise e - del self.db.webhook_sources[source_id] + del exe.db.webhook_sources[source_id] class CreateKafkaSourceAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.kafka_sources) > MAX_KAFKA_SOURCES: + with exe.db.lock: + if len(exe.db.kafka_sources) > MAX_KAFKA_SOURCES: return - source_id = self.db.kafka_source_id - self.db.kafka_source_id += 1 - potential_clusters = [c for c in self.db.clusters if len(c.replicas) == 1] + source_id = exe.db.kafka_source_id + exe.db.kafka_source_id += 1 + potential_clusters = [c for c in exe.db.clusters if len(c.replicas) == 1] cluster = self.rng.choice(potential_clusters) - schema = self.rng.choice(self.db.schemas) + schema = self.rng.choice(exe.db.schemas) try: source = KafkaSource( - self.db.name(), source_id, cluster, schema, self.db.ports, self.rng + exe.db.name(), + exe.db.db_id * DB_OFFSET + source_id, + cluster, + schema, + exe.db.ports, + self.rng, ) source.create(exe) - 
self.db.kafka_sources.append(source) + exe.db.kafka_sources.append(source) except: - if self.db.scenario != Scenario.Kill: + if exe.db.scenario != Scenario.Kill: raise class DropKafkaSourceAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "still depended upon by", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.kafka_sources: + with exe.db.lock: + if not exe.db.kafka_sources: return - source_id = self.rng.randrange(len(self.db.kafka_sources)) - source = self.db.kafka_sources[source_id] + source_id = self.rng.randrange(len(exe.db.kafka_sources)) + source = exe.db.kafka_sources[source_id] query = f"DROP SOURCE {source}" try: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + if ( + exe.db.scenario != Scenario.Kill + or "unknown catalog item" not in e.msg + ): raise e - del self.db.kafka_sources[source_id] + del exe.db.kafka_sources[source_id] class CreatePostgresSourceAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.postgres_sources) > MAX_POSTGRES_SOURCES: + with exe.db.lock: + if len(exe.db.postgres_sources) > MAX_POSTGRES_SOURCES: return - source_id = self.db.postgres_source_id - self.db.postgres_source_id += 1 - potential_clusters = [c for c in self.db.clusters if len(c.replicas) == 1] - schema = self.rng.choice(self.db.schemas) + source_id = exe.db.postgres_source_id + exe.db.postgres_source_id += 1 + potential_clusters = [c for c in exe.db.clusters if len(c.replicas) == 1] + schema = self.rng.choice(exe.db.schemas) cluster = self.rng.choice(potential_clusters) try: source = PostgresSource( - self.db.name(), source_id, cluster, schema, self.db.ports, self.rng + exe.db.name(), + exe.db.db_id * DB_OFFSET + source_id, + cluster, + schema, + exe.db.ports, + self.rng, ) source.create(exe) - self.db.postgres_sources.append(source) + exe.db.postgres_sources.append(source) except: - if self.db.scenario != Scenario.Kill: + if exe.db.scenario != Scenario.Kill: raise class DropPostgresSourceAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "still depended upon by", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.postgres_sources: + with exe.db.lock: + if not exe.db.postgres_sources: return - source_id = self.rng.randrange(len(self.db.postgres_sources)) - source = self.db.postgres_sources[source_id] + source_id = self.rng.randrange(len(exe.db.postgres_sources)) + source = exe.db.postgres_sources[source_id] query = f"DROP SOURCE {source.executor.source}" try: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + if ( + exe.db.scenario != Scenario.Kill + or "unknown catalog item" not in e.msg + ): raise e - del self.db.postgres_sources[source_id] + del exe.db.postgres_sources[source_id] class CreateKafkaSinkAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ # Another replica can be created in parallel "cannot create sink in cluster with more than one replica", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> 
None: - with self.db.lock: - if len(self.db.kafka_sinks) > MAX_KAFKA_SINKS: + with exe.db.lock: + if len(exe.db.kafka_sinks) > MAX_KAFKA_SINKS: return - sink_id = self.db.kafka_sink_id - self.db.kafka_sink_id += 1 - potential_clusters = [c for c in self.db.clusters if len(c.replicas) == 1] + sink_id = exe.db.kafka_sink_id + exe.db.kafka_sink_id += 1 + potential_clusters = [c for c in exe.db.clusters if len(c.replicas) == 1] cluster = self.rng.choice(potential_clusters) - schema = self.rng.choice(self.db.schemas) + schema = self.rng.choice(exe.db.schemas) sink = KafkaSink( - sink_id, + exe.db.db_id * DB_OFFSET + sink_id, cluster, schema, - self.rng.choice(self.db.db_objects_without_views()), + self.rng.choice(exe.db.db_objects_without_views()), self.rng, ) sink.create(exe) - self.db.kafka_sinks.append(sink) + exe.db.kafka_sinks.append(sink) class DropKafkaSinkAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "still depended upon by", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.kafka_sinks: + with exe.db.lock: + if not exe.db.kafka_sinks: return - sink_id = self.rng.randrange(len(self.db.kafka_sinks)) - sink = self.db.kafka_sinks[sink_id] + sink_id = self.rng.randrange(len(exe.db.kafka_sinks)) + sink = exe.db.kafka_sinks[sink_id] query = f"DROP SINK {sink}" try: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + if ( + exe.db.scenario != Scenario.Kill + or "unknown catalog item" not in e.msg + ): raise e - del self.db.kafka_sinks[sink_id] + del exe.db.kafka_sinks[sink_id] class HttpPostAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.webhook_sources: + with exe.db.lock: + if not exe.db.webhook_sources: return - source = self.rng.choice(self.db.webhook_sources) - url = f"http://{self.db.host}:{self.db.ports['http']}/api/webhook/{self.db}/public/{source}" + source = self.rng.choice(exe.db.webhook_sources) + url = f"http://{exe.db.host}:{exe.db.ports['http']}/api/webhook/{exe.db}/public/{source}" payload = source.body_format.to_data_type().random_value(self.rng) @@ -1052,7 +1085,7 @@ def run(self, exe: Executor) -> None: requests.post(url, data=payload.encode("utf-8"), headers=headers) except requests.exceptions.ConnectionError: # Expeceted when Mz is killed - if self.db.scenario != Scenario.Kill: + if exe.db.scenario != Scenario.Kill: raise diff --git a/misc/python/materialize/parallel_workload/database.py b/misc/python/materialize/parallel_workload/database.py index e3e5fb8853548..3f6a39bb5e7a7 100644 --- a/misc/python/materialize/parallel_workload/database.py +++ b/misc/python/materialize/parallel_workload/database.py @@ -33,6 +33,7 @@ from materialize.util import naughty_strings MAX_COLUMNS = 100 +MAX_INCLUDE_HEADERS = 5 MAX_ROWS = 1000 MAX_CLUSTERS = 10 MAX_CLUSTER_REPLICAS = 4 @@ -44,7 +45,6 @@ MAX_KAFKA_SOURCES = 20 MAX_POSTGRES_SOURCES = 20 MAX_KAFKA_SINKS = 20 -MAX_INCLUDE_HEADERS = 5 MAX_INITIAL_SCHEMAS = 1 MAX_INITIAL_CLUSTERS = 2 @@ -56,7 +56,7 @@ MAX_INITIAL_POSTGRES_SOURCES = 3 MAX_INITIAL_KAFKA_SINKS = 3 -MAX_IDENTIFIER_LENGTH = 255 +DB_OFFSET = 1_000_000 NAUGHTY_IDENTIFIERS = False @@ -654,6 +654,7 @@ def create(self, exe: Executor) -> None: class Database: + db_id: int seed: str complexity: Complexity scenario: Scenario @@ -682,6 +683,7 @@ class Database: def 
__init__( self, + db_id: int, rng: random.Random, seed: str, host: str, @@ -691,6 +693,7 @@ def __init__( naughty_identifiers: bool, ): global NAUGHTY_IDENTIFIERS + self.db_id = db_id self.seed = seed self.host = host self.ports = ports @@ -718,12 +721,15 @@ def __init__( view = View(rng, i, base_object, base_object2, rng.choice(self.schemas)) self.views.append(view) self.view_id = len(self.views) - self.roles = [Role(i) for i in range(rng.randint(0, MAX_INITIAL_ROLES))] + self.roles = [ + Role(db_id * DB_OFFSET + i) + for i in range(rng.randint(0, MAX_INITIAL_ROLES)) + ] self.role_id = len(self.roles) # At least one storage cluster required for WebhookSources self.clusters = [ Cluster( - i, + self.db_id * DB_OFFSET + i, managed=rng.choice([True, False]), size=rng.choice(["1", "2", "4"]), replication_factor=1, @@ -741,7 +747,7 @@ def __init__( self.kafka_sources = [ KafkaSource( self.name(), - i, + self.db_id * DB_OFFSET + i, rng.choice(self.clusters), rng.choice(self.schemas), ports, @@ -753,7 +759,7 @@ def __init__( self.postgres_sources = [ PostgresSource( self.name(), - i, + self.db_id * DB_OFFSET + i, rng.choice(self.clusters), rng.choice(self.schemas), ports, @@ -776,7 +782,7 @@ def __init__( self.lock = threading.Lock() def name(self) -> str: - return naughtify(f"db-pw-{self.seed}") + return naughtify(f"db-pw-{self.seed}-{self.db_id}") def __str__(self) -> str: return identifier(self.name()) @@ -810,48 +816,19 @@ def drop(self, exe: Executor) -> None: def create(self, exe: Executor) -> None: self.drop(exe) - exe.execute("ALTER SYSTEM SET enable_webhook_sources TO true") - exe.execute("ALTER SYSTEM SET max_schemas_per_database = 105") - # The presence of ALTER TABLE RENAME can cause the total number of tables to exceed MAX_TABLES - exe.execute("ALTER SYSTEM SET max_tables = 200") - exe.execute("ALTER SYSTEM SET max_materialized_views = 105") - exe.execute("ALTER SYSTEM SET max_sources = 105") - exe.execute("ALTER SYSTEM SET max_roles = 105") - exe.execute("ALTER SYSTEM SET max_clusters = 105") - exe.execute("ALTER SYSTEM SET max_replicas_per_cluster = 105") - # Most queries should not fail because of privileges - exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON TABLES TO PUBLIC" - ) - exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON TYPES TO PUBLIC" - ) - exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON SECRETS TO PUBLIC" - ) - exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON CONNECTIONS TO PUBLIC" - ) - exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON DATABASES TO PUBLIC" - ) - exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON SCHEMAS TO PUBLIC" - ) - exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON CLUSTERS TO PUBLIC" - ) exe.execute(f"CREATE DATABASE {self}") exe.execute(f"ALTER DATABASE {self} OWNER TO materialize") def create_relations(self, exe: Executor) -> None: - exe.execute("SELECT name FROM mz_clusters WHERE name LIKE 'c%'") - for row in exe.cur.fetchall(): - exe.execute(f"DROP CLUSTER {row[0]} CASCADE") - - exe.execute("SELECT name FROM mz_roles WHERE name LIKE 'r%'") - for row in exe.cur.fetchall(): - exe.execute(f"DROP ROLE {row[0]}") + # Roles and clusters are system wide, not per DB + if self.db_id == 0: + exe.execute("SELECT name FROM mz_clusters WHERE name LIKE 'c%'") + for row in exe.cur.fetchall(): + exe.execute(f"DROP CLUSTER {identifier(row[0])} CASCADE") + 
+ exe.execute("SELECT name FROM mz_roles WHERE name LIKE 'r%'") + for row in exe.cur.fetchall(): + exe.execute(f"DROP ROLE {identifier(row[0])}") exe.execute("CREATE CONNECTION kafka_conn FOR KAFKA BROKER 'kafka:9092'") exe.execute( diff --git a/misc/python/materialize/parallel_workload/executor.py b/misc/python/materialize/parallel_workload/executor.py index 592111fbf0cf2..27cd1f4613b73 100644 --- a/misc/python/materialize/parallel_workload/executor.py +++ b/misc/python/materialize/parallel_workload/executor.py @@ -9,10 +9,13 @@ import random import threading -from typing import TextIO +from typing import TYPE_CHECKING, TextIO import pg8000 +if TYPE_CHECKING: + from materialize.parallel_workload.database import Database + logging: TextIO | None lock: threading.Lock @@ -38,12 +41,18 @@ class Executor: pg_pid: int # Used by INSERT action to prevent writing into different tables in the same transaction insert_table: int | None + db: "Database" + reconnect_next: bool + rollback_next: bool - def __init__(self, rng: random.Random, cur: pg8000.Cursor): + def __init__(self, rng: random.Random, cur: pg8000.Cursor, db: "Database"): self.rng = rng self.cur = cur + self.db = db self.pg_pid = -1 self.insert_table = None + self.reconnect_next = True + self.rollback_next = True def set_isolation(self, level: str) -> None: self.execute(f"SET TRANSACTION_ISOLATION TO '{level}'") @@ -71,7 +80,7 @@ def log(self, msg: str) -> None: thread_name = threading.current_thread().getName() with lock: - print(f"[{thread_name}] {msg}", file=logging) + print(f"[{thread_name}][{self.db.name()}] {msg}", file=logging) logging.flush() def execute( diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index 0895e107aaa23..35d99b0cffa9f 100644 --- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -30,7 +30,19 @@ read_action_list, write_action_list, ) -from materialize.parallel_workload.database import Database +from materialize.parallel_workload.database import ( + MAX_CLUSTER_REPLICAS, + MAX_CLUSTERS, + MAX_KAFKA_SINKS, + MAX_KAFKA_SOURCES, + MAX_POSTGRES_SOURCES, + MAX_ROLES, + MAX_SCHEMAS, + MAX_TABLES, + MAX_VIEWS, + MAX_WEBHOOK_SOURCES, + Database, +) from materialize.parallel_workload.executor import Executor, initialize_logging from materialize.parallel_workload.settings import Complexity, Scenario from materialize.parallel_workload.worker import Worker @@ -48,13 +60,14 @@ def run( scenario: Scenario, num_threads: int | None, naughty_identifiers: bool, + num_databases: int, composition: Composition | None, ) -> None: num_threads = num_threads or os.cpu_count() or 10 random.seed(seed) print( - f"--- Running with: --seed={seed} --threads={num_threads} --runtime={runtime} --complexity={complexity.value} --scenario={scenario.value} --naughty_identifiers={naughty_identifiers} (--host={host})" + f"--- Running with: --seed={seed} --threads={num_threads} --runtime={runtime} --complexity={complexity.value} --scenario={scenario.value} --naughty_identifiers={naughty_identifiers} --databases={num_databases} (--host={host})" ) initialize_logging() @@ -63,28 +76,79 @@ def run( ).timestamp() rng = random.Random(random.randrange(SEED_RANGE)) - database = Database( - rng, seed, host, ports, complexity, scenario, naughty_identifiers - ) + databases = [ + Database(i, rng, seed, host, ports, complexity, scenario, naughty_identifiers) + for i in range(num_databases) + 
] system_conn = pg8000.connect( host=host, port=ports["mz_system"], user="mz_system", database="materialize" ) system_conn.autocommit = True - with system_conn.cursor() as cur: - database.create(Executor(rng, cur)) - system_conn.close() + with system_conn.cursor() as system_cur: + system_exe = Executor(rng, system_cur, databases[0]) + system_exe.execute("ALTER SYSTEM SET enable_webhook_sources TO true") + system_exe.execute( + f"ALTER SYSTEM SET max_schemas_per_database = {MAX_SCHEMAS + 5}" + ) + # The presence of ALTER TABLE RENAME can cause the total number of tables to exceed MAX_TABLES + system_exe.execute( + f"ALTER SYSTEM SET max_tables = {len(databases) * MAX_TABLES * 2}" + ) + system_exe.execute( + f"ALTER SYSTEM SET max_materialized_views = {len(databases) * MAX_VIEWS + 5}" + ) + system_exe.execute( + f"ALTER SYSTEM SET max_sources = {len(databases) * (MAX_WEBHOOK_SOURCES + MAX_KAFKA_SOURCES + MAX_POSTGRES_SOURCES) * 2}" + ) + system_exe.execute( + f"ALTER SYSTEM SET max_sinks = {len(databases) * MAX_KAFKA_SINKS + 5}" + ) + system_exe.execute( + f"ALTER SYSTEM SET max_roles = {len(databases) * MAX_ROLES + 5}" + ) + system_exe.execute( + f"ALTER SYSTEM SET max_clusters = {len(databases) * MAX_CLUSTERS + 5}" + ) + system_exe.execute( + f"ALTER SYSTEM SET max_replicas_per_cluster = {MAX_CLUSTER_REPLICAS + 5}" + ) + # Most queries should not fail because of privileges + system_exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON TABLES TO PUBLIC" + ) + system_exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON TYPES TO PUBLIC" + ) + system_exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON SECRETS TO PUBLIC" + ) + system_exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON CONNECTIONS TO PUBLIC" + ) + system_exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON DATABASES TO PUBLIC" + ) + system_exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON SCHEMAS TO PUBLIC" + ) + system_exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON CLUSTERS TO PUBLIC" + ) + for database in databases: + database.create(system_exe) - conn = pg8000.connect( - host=host, - port=ports["materialized"], - user="materialize", - database=database.name(), - ) - conn.autocommit = True - with conn.cursor() as cur: - database.create_relations(Executor(rng, cur)) - conn.close() + conn = pg8000.connect( + host=host, + port=ports["materialized"], + user="materialize", + database=database.name(), + ) + conn.autocommit = True + with conn.cursor() as cur: + database.create_relations(Executor(rng, cur, database)) + conn.close() + system_conn.close() workers = [] threads = [] @@ -110,8 +174,7 @@ def run( weights, )[0] actions = [ - action_class(worker_rng, database) - for action_class in action_list.action_classes + action_class(worker_rng) for action_class in action_list.action_classes ] worker = Worker( worker_rng, @@ -130,7 +193,7 @@ def run( thread = threading.Thread( name=thread_name, target=worker.run, - args=(host, ports["materialized"], "materialize", database.name()), + args=(host, ports["materialized"], "materialize", databases), ) thread.start() threads.append(thread) @@ -138,7 +201,7 @@ def run( if scenario == Scenario.Cancel: worker = Worker( worker_rng, - [CancelAction(worker_rng, database, workers)], + [CancelAction(worker_rng, workers)], [1], end_time, autocommit=False, @@ -148,7 +211,7 @@ def run( thread = threading.Thread( 
name="cancel", target=worker.run, - args=(host, ports["mz_system"], "mz_system", str(database)), + args=(host, ports["mz_system"], "mz_system", databases), ) thread.start() threads.append(thread) @@ -156,7 +219,7 @@ def run( assert composition, "Kill scenario only works in mzcompose" worker = Worker( worker_rng, - [KillAction(worker_rng, database, composition)], + [KillAction(worker_rng, composition)], [1], end_time, autocommit=False, @@ -166,7 +229,7 @@ def run( thread = threading.Thread( name="kill", target=worker.run, - args=(host, ports["materialized"], "materialize", str(database)), + args=(host, ports["materialized"], "materialize", databases), ) thread.start() threads.append(thread) @@ -204,8 +267,9 @@ def run( conn = pg8000.connect(host=host, port=ports["materialized"], user="materialize") conn.autocommit = True with conn.cursor() as cur: - print(f"Dropping database {database}") - database.drop(Executor(rng, cur)) + for database in databases: + print(f"Dropping database {database}") + database.drop(Executor(rng, cur, database)) conn.close() ignored_errors: defaultdict[str, Counter[type[Action]]] = defaultdict(Counter) @@ -253,6 +317,12 @@ def parse_common_args(parser: argparse.ArgumentParser) -> None: action="store_true", help="Whether to use naughty strings as identifiers, makes the queries unreadable", ) + parser.add_argument( + "--databases", + default=2, + type=int, + help="Number of databases to create and run against, 2 by default", + ) def main() -> int: @@ -301,6 +371,7 @@ def main() -> int: Scenario(args.scenario), args.threads, args.naughty_identifiers, + args.databases, composition=None, # only works in mzcompose ) return 0 diff --git a/misc/python/materialize/parallel_workload/worker.py b/misc/python/materialize/parallel_workload/worker.py index 2114800f20352..167683db97777 100644 --- a/misc/python/materialize/parallel_workload/worker.py +++ b/misc/python/materialize/parallel_workload/worker.py @@ -15,6 +15,7 @@ import pg8000 from materialize.parallel_workload.action import Action, ReconnectAction +from materialize.parallel_workload.database import Database from materialize.parallel_workload.executor import Executor, QueryError @@ -26,7 +27,7 @@ class Worker: num_queries: int autocommit: bool system: bool - exe: Executor | None + exes: list[Executor] ignored_errors: defaultdict[str, Counter[type[Action]]] def __init__( @@ -46,25 +47,30 @@ def __init__( self.autocommit = autocommit self.system = system self.ignored_errors = defaultdict(Counter) - self.exe = None + self.exes = [] + + def run(self, host: str, port: int, user: str, databases: list[Database]) -> None: + self.conns = [ + pg8000.connect(host=host, port=port, user=user, database=database.name()) + for database in databases + ] + for database, conn in zip(databases, self.conns): + conn.autocommit = self.autocommit + cur = conn.cursor() + exe = Executor(self.rng, cur, database) + exe.set_isolation("SERIALIZABLE") + cur.execute("SELECT pg_backend_pid()") + exe.pg_pid = cur.fetchall()[0][0] + self.exes.append(exe) - def run(self, host: str, port: int, user: str, database: str) -> None: - self.conn = pg8000.connect(host=host, port=port, user=user, database=database) - self.conn.autocommit = self.autocommit - cur = self.conn.cursor() - self.exe = Executor(self.rng, cur) - self.exe.set_isolation("SERIALIZABLE") - cur.execute("SELECT pg_backend_pid()") - self.exe.pg_pid = cur.fetchall()[0][0] - rollback_next = True - reconnect_next = True while time.time() < self.end_time: + exe = self.rng.choice(self.exes) action = 
self.rng.choices(self.actions, self.weights)[0] self.num_queries += 1 try: - if rollback_next: + if exe.rollback_next: try: - self.exe.rollback() + exe.rollback() except QueryError as e: if ( "Please disconnect and re-connect" in e.msg @@ -72,18 +78,16 @@ def run(self, host: str, port: int, user: str, database: str) -> None: or "Can't create a connection to host" in e.msg or "Connection refused" in e.msg ): - reconnect_next = True - rollback_next = False + exe.reconnect_next = True + exe.rollback_next = False continue - rollback_next = False - if reconnect_next: - ReconnectAction(self.rng, action.db, random_role=False).run( - self.exe - ) - reconnect_next = False - action.run(self.exe) + exe.rollback_next = False + if exe.reconnect_next: + ReconnectAction(self.rng, random_role=False).run(exe) + exe.reconnect_next = False + action.run(exe) except QueryError as e: - for error in action.errors_to_ignore(): + for error in action.errors_to_ignore(exe): if error in e.msg: self.ignored_errors[error][type(action)] += 1 if ( @@ -92,11 +96,13 @@ def run(self, host: str, port: int, user: str, database: str) -> None: or "Can't create a connection to host" in e.msg or "Connection refused" in e.msg ): - reconnect_next = True + exe.reconnect_next = True else: - rollback_next = True + exe.rollback_next = True break else: thread_name = threading.current_thread().getName() - print(f"{thread_name} Query failed: {e.query} {e.msg}") + print( + f"[{thread_name}][{exe.db.name()}] Query failed: {e.query} {e.msg}" + ) raise diff --git a/test/parallel-workload/mzcompose.py b/test/parallel-workload/mzcompose.py index bfc0a4a562e4e..13c89a7b45007 100644 --- a/test/parallel-workload/mzcompose.py +++ b/test/parallel-workload/mzcompose.py @@ -68,6 +68,7 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: Scenario(args.scenario), args.threads, args.naughty_identifiers, + args.databases, c, ) # TODO: Only ignore errors that will be handled by parallel-workload, not others From a19593da6ef814d019f13d43ccec9fbc3cf8b75e Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Mon, 23 Oct 2023 17:09:30 +0000 Subject: [PATCH 04/17] parallel-workload: Workaround for 21954 --- ci/nightly/pipeline.template.yml | 16 +++++----- .../materialize/parallel_workload/action.py | 19 ++++++++++++ .../parallel_workload/parallel_workload.py | 30 +++++++++++++++---- 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index ba7813703d038..eeb7750f6866d 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -797,7 +797,7 @@ steps: - id: parallel-workload-dml label: "Parallel Workload (DML)" artifact_paths: [junit_*.xml, parallel-workload-queries.log.zst] - timeout_in_minutes: 40 + timeout_in_minutes: 60 agents: queue: builder-linux-x86_64 plugins: @@ -808,7 +808,7 @@ steps: - id: parallel-workload-ddl label: "Parallel Workload (DDL)" artifact_paths: [junit_*.xml, parallel-workload-queries.log.zst] - timeout_in_minutes: 40 + timeout_in_minutes: 60 agents: queue: builder-linux-x86_64 plugins: @@ -819,7 +819,7 @@ steps: - id: parallel-workload-100-threads label: "Parallel Workload (100 threads)" artifact_paths: [junit_*.xml, parallel-workload-queries.log.zst] - timeout_in_minutes: 40 + timeout_in_minutes: 60 agents: queue: builder-linux-x86_64 plugins: @@ -831,7 +831,7 @@ steps: - id: parallel-workload-rename-naughty label: "Parallel Workload (rename + naughty identifiers)" artifact_paths: [junit_*.xml, 
parallel-workload-queries.log.zst]
-    timeout_in_minutes: 40
+    timeout_in_minutes: 60
     agents:
       queue: builder-linux-x86_64
     plugins:
@@ -842,7 +842,7 @@ steps:
   - id: parallel-workload-rename
     label: "Parallel Workload (rename)"
     artifact_paths: [junit_*.xml, parallel-workload-queries.log.zst]
-    timeout_in_minutes: 40
+    timeout_in_minutes: 60
     agents:
       queue: builder-linux-x86_64
     plugins:
@@ -853,7 +853,7 @@ steps:
   - id: parallel-workload-cancel
     label: "Parallel Workload (cancel)"
    artifact_paths: [junit_*.xml, parallel-workload-queries.log.zst]
-    timeout_in_minutes: 40
+    timeout_in_minutes: 60
     agents:
       queue: builder-linux-x86_64
     plugins:
@@ -864,7 +864,7 @@ steps:
   - id: parallel-workload-kill
     label: "Parallel Workload (kill)"
     artifact_paths: [junit_*.xml, parallel-workload-queries.log.zst]
-    timeout_in_minutes: 40
+    timeout_in_minutes: 60
     agents:
       queue: builder-linux-x86_64
     plugins:
@@ -875,7 +875,7 @@ steps:
   - id: parallel-workload-backup-restore
     label: "Parallel Workload (backup & restore)"
     artifact_paths: [junit_*.xml, parallel-workload-queries.log.zst]
-    timeout_in_minutes: 40
+    timeout_in_minutes: 60
     agents:
       queue: builder-linux-x86_64
     plugins:
diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py
index e49161133eddc..045b745a642cb 100644
--- a/misc/python/materialize/parallel_workload/action.py
+++ b/misc/python/materialize/parallel_workload/action.py
@@ -63,6 +63,7 @@ def errors_to_ignore(self, exe: Executor) -> list[str]:
         result = [
             "permission denied for",
             "must be owner of",
+            "network error",  # #21954, remove when fixed
         ]
         if exe.db.complexity == Complexity.DDL:
             result.extend(
@@ -861,6 +862,24 @@ def run(self, exe: Executor) -> None:
         time.sleep(self.rng.uniform(20, 180))


+class BackupRestoreAction(Action):
+    composition: Composition
+    exes: list[Executor]
+
+    def __init__(
+        self,
+        rng: random.Random,
+        composition: Composition,
+        exes: list[Executor]) -> None:
+        super().__init__(rng)
+        self.composition = composition
+        self.exes = exes
+
+    def run(self, exe: Executor) -> None:
+        time.sleep(self.rng.uniform(10, 120))
+        # TODO: Backup & restore here
+
+
 class CreateWebhookSourceAction(Action):
     def run(self, exe: Executor) -> None:
         with exe.db.lock:
diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py
index 35d99b0cffa9f..8734a5b77da64 100644
--- a/misc/python/materialize/parallel_workload/parallel_workload.py
+++ b/misc/python/materialize/parallel_workload/parallel_workload.py
@@ -89,29 +89,29 @@ def run(
         system_exe = Executor(rng, system_cur, databases[0])
         system_exe.execute("ALTER SYSTEM SET enable_webhook_sources TO true")
         system_exe.execute(
-            f"ALTER SYSTEM SET max_schemas_per_database = {MAX_SCHEMAS + 5}"
+            f"ALTER SYSTEM SET max_schemas_per_database = {MAX_SCHEMAS * 2}"
         )
         # The presence of ALTER TABLE RENAME can cause the total number of tables to exceed MAX_TABLES
         system_exe.execute(
             f"ALTER SYSTEM SET max_tables = {len(databases) * MAX_TABLES * 2}"
         )
         system_exe.execute(
-            f"ALTER SYSTEM SET max_materialized_views = {len(databases) * MAX_VIEWS + 5}"
+            f"ALTER SYSTEM SET max_materialized_views = {len(databases) * MAX_VIEWS * 2}"
         )
         system_exe.execute(
             f"ALTER SYSTEM SET max_sources = {len(databases) * (MAX_WEBHOOK_SOURCES + MAX_KAFKA_SOURCES + MAX_POSTGRES_SOURCES) * 2}"
         )
         system_exe.execute(
-            f"ALTER SYSTEM SET max_sinks = {len(databases) * MAX_KAFKA_SINKS + 5}"
+            f"ALTER SYSTEM SET max_sinks = {len(databases) * 
MAX_KAFKA_SINKS * 2}" ) system_exe.execute( - f"ALTER SYSTEM SET max_roles = {len(databases) * MAX_ROLES + 5}" + f"ALTER SYSTEM SET max_roles = {len(databases) * MAX_ROLES * 2}" ) system_exe.execute( - f"ALTER SYSTEM SET max_clusters = {len(databases) * MAX_CLUSTERS + 5}" + f"ALTER SYSTEM SET max_clusters = {len(databases) * MAX_CLUSTERS * 2}" ) system_exe.execute( - f"ALTER SYSTEM SET max_replicas_per_cluster = {MAX_CLUSTER_REPLICAS + 5}" + f"ALTER SYSTEM SET max_replicas_per_cluster = {MAX_CLUSTER_REPLICAS * 2}" ) # Most queries should not fail because of privileges system_exe.execute( @@ -233,6 +233,24 @@ def run( ) thread.start() threads.append(thread) + elif scenario == Scenario.BackupRestore: + assert composition, "Backup & Restore scenario only works in mzcompose" + worker = Worker( + worker_rng, + [BackupRestoreAction(worker_rng, composition, exes)], + [1], + end_time, + autocommit=False, + system=False, + ) + workers.append(worker) + thread = threading.Thread( + name="kill", + target=worker.run, + args=(host, ports["materialized"], "materialize", databases), + ) + thread.start() + threads.append(thread) elif scenario in (Scenario.Regression, Scenario.Rename): pass else: From 5a549edeee06ac09790c72d05c6cd6ebca610abb Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Wed, 25 Oct 2023 13:07:29 +0000 Subject: [PATCH 05/17] parallel-workload: Disable Set cluster in transactions --- misc/python/materialize/parallel_workload/action.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 045b745a642cb..29636a41a8d2b 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -92,7 +92,7 @@ def errors_to_ignore(self, exe: Executor) -> list[str]: "Connection refused", ] ) - if self.db.scenario == Scenario.Rename: + if exe.db.scenario == Scenario.Rename: result.extend(["unknown schema", "ambiguous reference to schema name"]) if materialize.parallel_workload.database.NAUGHTY_IDENTIFIERS: result.extend(["identifier length exceeds 255 bytes"]) @@ -1124,7 +1124,7 @@ def __init__( read_action_list = ActionList( [ (SelectAction, 100), - (SetClusterAction, 1), + # (SetClusterAction, 1), # SET cluster cannot be called in an active transaction (CommitRollbackAction, 1), (ReconnectAction, 1), ], @@ -1134,7 +1134,7 @@ def __init__( fetch_action_list = ActionList( [ (FetchAction, 30), - (SetClusterAction, 1), + # (SetClusterAction, 1), # SET cluster cannot be called in an active transaction (ReconnectAction, 1), ], autocommit=False, @@ -1143,7 +1143,7 @@ def __init__( write_action_list = ActionList( [ (InsertAction, 100), - (SetClusterAction, 1), + # (SetClusterAction, 1), # SET cluster cannot be called in an active transaction (HttpPostAction, 50), (CommitRollbackAction, 1), (ReconnectAction, 1), From 47a8e368a84d079d7b29c14b60bc3d1d4ec12c69 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Wed, 25 Oct 2023 15:39:50 +0000 Subject: [PATCH 06/17] parallel-workload: Add backup&restore scenario --- ci/nightly/pipeline.template.yml | 2 +- .../materialize/parallel_workload/action.py | 67 ++++++++++++++++--- .../parallel_workload/parallel_workload.py | 3 +- test/parallel-workload/mzcompose.py | 22 ++++++ 4 files changed, 83 insertions(+), 11 deletions(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index eeb7750f6866d..4b418ff7b9552 100644 --- 
a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -881,7 +881,7 @@ steps: plugins: - ./ci/plugins/mzcompose: composition: parallel-workload - args: [--runtime=1500, --scenario=backup-restore] + args: [--runtime=1500, --scenario=backup-restore, --naughty-identifiers] - id: incident-70 label: "Test for incident 70" diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 29636a41a8d2b..928f07ce6209b 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -18,6 +18,7 @@ import materialize.parallel_workload.database from materialize.data_ingest.data_type import NUMBER_TYPES, Text, TextTextMap from materialize.mzcompose.composition import Composition +from materialize.mzcompose.services.minio import MINIO_BLOB_URI from materialize.parallel_workload.database import ( DB_OFFSET, MAX_CLUSTER_REPLICAS, @@ -33,6 +34,7 @@ MAX_WEBHOOK_SOURCES, Cluster, ClusterReplica, + Database, DBObject, KafkaSink, KafkaSource, @@ -859,25 +861,72 @@ def run(self, exe: Executor) -> None: # Otherwise getting failure on "up" locally time.sleep(1) self.composition.up("materialized", detach=True) - time.sleep(self.rng.uniform(20, 180)) + time.sleep(self.rng.uniform(120, 240)) +# TODO: Don't restore immediately, keep copy Database objects class BackupRestoreAction(Action): composition: Composition - exes: list[Executor] + databases: list[Database] + num: int def __init__( - self, - rng: random.Random, - composition: Composition, - exes: list[Executor]) -> None: + self, rng: random.Random, composition: Composition, databases: list[Database] + ) -> None: super().__init__(rng) self.composition = composition - self.exes = exes + self.databases = databases + self.num = 0 def run(self, exe: Executor) -> None: - time.sleep(self.rng.uniform(10, 120)) - # TODO: Backup & restore here + self.num += 1 + time.sleep(self.rng.uniform(10, 240)) + for db in self.databases: + db.lock.acquire() + + try: + # Backup + self.composition.exec("mc", "mc", "mb", f"persist/crdb-backup{self.num}") + self.composition.exec( + "cockroach", + "cockroach", + "sql", + "--insecure", + "-e", + f""" + CREATE EXTERNAL CONNECTION backup_bucket{self.num} AS 's3://persist/crdb-backup{self.num}?AWS_ENDPOINT=http://minio:9000/&AWS_REGION=minio&AWS_ACCESS_KEY_ID=minioadmin&AWS_SECRET_ACCESS_KEY=minioadmin'; + BACKUP INTO 'external://backup_bucket{self.num}'; + """, + ) + self.composition.kill("materialized") + + # Restore + self.composition.exec( + "cockroach", + "cockroach", + "sql", + "--insecure", + "-e", + f""" + DROP DATABASE defaultdb; + RESTORE DATABASE defaultdb FROM LATEST IN 'external://backup_bucket{self.num}'; + SELECT shard, min(sequence_number), max(sequence_number) + FROM consensus.consensus GROUP BY 1 ORDER BY 2 DESC, 3 DESC, 1 ASC LIMIT 32; + """, + ) + self.composition.run( + "persistcli", + "admin", + "--commit", + "restore-blob", + f"--blob-uri={MINIO_BLOB_URI}", + "--consensus-uri=postgres://root@cockroach:26257?options=--search_path=consensus", + ) + self.composition.up("materialized") + + finally: + for db in self.databases: + db.lock.release() class CreateWebhookSourceAction(Action): diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index 8734a5b77da64..16539d1d9a845 100644 --- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ 
b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -22,6 +22,7 @@ from materialize.mzcompose.composition import Composition from materialize.parallel_workload.action import ( Action, + BackupRestoreAction, CancelAction, KillAction, ddl_action_list, @@ -237,7 +238,7 @@ def run( assert composition, "Backup & Restore scenario only works in mzcompose" worker = Worker( worker_rng, - [BackupRestoreAction(worker_rng, composition, exes)], + [BackupRestoreAction(worker_rng, composition, databases)], [1], end_time, autocommit=False, diff --git a/test/parallel-workload/mzcompose.py b/test/parallel-workload/mzcompose.py index 13c89a7b45007..c4ee9f7b5974f 100644 --- a/test/parallel-workload/mzcompose.py +++ b/test/parallel-workload/mzcompose.py @@ -9,9 +9,11 @@ from materialize.mzcompose.composition import Composition, WorkflowArgumentParser +from materialize.mzcompose.service import Service from materialize.mzcompose.services.cockroach import Cockroach from materialize.mzcompose.services.kafka import Kafka from materialize.mzcompose.services.materialized import Materialized +from materialize.mzcompose.services.minio import Mc, Minio from materialize.mzcompose.services.postgres import Postgres from materialize.mzcompose.services.schema_registry import SchemaRegistry from materialize.mzcompose.services.zookeeper import Zookeeper @@ -32,11 +34,18 @@ ], ), SchemaRegistry(), + Minio(setup_materialize=True), + Mc(), Materialized( external_cockroach=True, restart="on-failure", + external_minio=True, ports=["6975:6875", "6976:6876", "6977:6877"], ), + Service( + name="persistcli", + config={"mzbuild": "jobs"}, + ), ] @@ -51,9 +60,22 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: "zookeeper", "kafka", "schema-registry", + "minio", "materialized", ] c.up(*service_names) + c.up("mc", persistent=True) + c.exec( + "mc", + "mc", + "alias", + "set", + "persist", + "http://minio:9000/", + "minioadmin", + "minioadmin", + ) + c.exec("mc", "mc", "version", "enable", "persist/persist") ports = {s: c.default_port(s) for s in service_names} ports["http"] = c.port("materialized", 6876) From a7413be3e45098e4ff4e1c8a8f03f8740d483bc9 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Wed, 25 Oct 2023 22:55:08 +0000 Subject: [PATCH 07/17] parallel-workload: Reenable NULLs after 21937 has been fixed --- misc/python/materialize/data_ingest/data_type.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc/python/materialize/data_ingest/data_type.py b/misc/python/materialize/data_ingest/data_type.py index c62cffeb7510c..69e0963fd2e97 100644 --- a/misc/python/materialize/data_ingest/data_type.py +++ b/misc/python/materialize/data_ingest/data_type.py @@ -231,7 +231,7 @@ def random_value( if rng.randrange(10) == 0: result = rng.choice( [ - # "NULL", # TODO: Reenable after #21937 is fixed + "NULL", "0.0", "True", # "", From c7504fa250bb8b5429b16cd578ec20661df5644a Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Thu, 26 Oct 2023 08:40:47 +0000 Subject: [PATCH 08/17] data-ingest: Use QueryError too --- misc/python/materialize/data_ingest/executor.py | 5 +++-- .../materialize/data_ingest/query_error.py | 17 +++++++++++++++++ .../materialize/parallel_workload/action.py | 3 ++- .../materialize/parallel_workload/executor.py | 11 ++--------- .../materialize/parallel_workload/worker.py | 3 ++- 5 files changed, 26 insertions(+), 13 deletions(-) create mode 100644 misc/python/materialize/data_ingest/query_error.py diff --git 
a/misc/python/materialize/data_ingest/executor.py b/misc/python/materialize/data_ingest/executor.py index d72a1eb668862..ac9dc5758f89c 100644 --- a/misc/python/materialize/data_ingest/executor.py +++ b/misc/python/materialize/data_ingest/executor.py @@ -25,6 +25,7 @@ from materialize.data_ingest.data_type import Backend from materialize.data_ingest.field import Field, formatted_value +from materialize.data_ingest.query_error import QueryError from materialize.data_ingest.row import Operation from materialize.data_ingest.transaction import Transaction @@ -76,9 +77,9 @@ def execute(self, cur: pg8000.Cursor, query: str) -> None: self.reconnect() with self.mz_conn.cursor() as cur: self.execute(cur, query) - except: + except Exception as e: print(f"Query failed: {query}") - raise + raise QueryError(str(e), query) def execute_with_retry_on_error( self, diff --git a/misc/python/materialize/data_ingest/query_error.py b/misc/python/materialize/data_ingest/query_error.py new file mode 100644 index 0000000000000..713ef31460dcd --- /dev/null +++ b/misc/python/materialize/data_ingest/query_error.py @@ -0,0 +1,17 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + + +class QueryError(Exception): + msg: str + query: str + + def __init__(self, msg: str, query: str): + self.msg = msg + self.query = query diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 928f07ce6209b..8ad51ae7260de 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -17,6 +17,7 @@ import materialize.parallel_workload.database from materialize.data_ingest.data_type import NUMBER_TYPES, Text, TextTextMap +from materialize.data_ingest.query_error import QueryError from materialize.mzcompose.composition import Composition from materialize.mzcompose.services.minio import MINIO_BLOB_URI from materialize.parallel_workload.database import ( @@ -45,7 +46,7 @@ View, WebhookSource, ) -from materialize.parallel_workload.executor import Executor, QueryError +from materialize.parallel_workload.executor import Executor from materialize.parallel_workload.settings import Complexity, Scenario if TYPE_CHECKING: diff --git a/misc/python/materialize/parallel_workload/executor.py b/misc/python/materialize/parallel_workload/executor.py index 27cd1f4613b73..1159c0bba159c 100644 --- a/misc/python/materialize/parallel_workload/executor.py +++ b/misc/python/materialize/parallel_workload/executor.py @@ -13,6 +13,8 @@ import pg8000 +from materialize.data_ingest.query_error import QueryError + if TYPE_CHECKING: from materialize.parallel_workload.database import Database @@ -26,15 +28,6 @@ def initialize_logging() -> None: lock = threading.Lock() -class QueryError(Exception): - msg: str - query: str - - def __init__(self, msg: str, query: str): - self.msg = msg - self.query = query - - class Executor: rng: random.Random cur: pg8000.Cursor diff --git a/misc/python/materialize/parallel_workload/worker.py b/misc/python/materialize/parallel_workload/worker.py index 167683db97777..92eecd72fc8cc 100644 --- a/misc/python/materialize/parallel_workload/worker.py +++ 
b/misc/python/materialize/parallel_workload/worker.py @@ -14,9 +14,10 @@ import pg8000 +from materialize.data_ingest.query_error import QueryError from materialize.parallel_workload.action import Action, ReconnectAction from materialize.parallel_workload.database import Database -from materialize.parallel_workload.executor import Executor, QueryError +from materialize.parallel_workload.executor import Executor class Worker: From 3bddb002bdd210516ca567644836c259cd749b74 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Thu, 26 Oct 2023 08:43:36 +0000 Subject: [PATCH 09/17] parallel-workload: Try reenabling 100 threads --- ci/nightly/pipeline.template.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index 4b418ff7b9552..a06efbb4b07ac 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -826,7 +826,6 @@ steps: - ./ci/plugins/mzcompose: composition: parallel-workload args: [--runtime=1500, --threads=100] - skip: "TODO(def-): Reenable when #21954 is fixed" - id: parallel-workload-rename-naughty label: "Parallel Workload (rename + naughty identifiers)" From cc63474f4d3fe3545b5b33b8fe2e46c24c667390 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Thu, 26 Oct 2023 10:19:08 +0000 Subject: [PATCH 10/17] Fix connection errors during backup&restore --- misc/python/materialize/parallel_workload/action.py | 6 +++--- .../materialize/parallel_workload/parallel_workload.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 8ad51ae7260de..0ead98a123aa9 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -1152,9 +1152,9 @@ def run(self, exe: Executor) -> None: ) try: requests.post(url, data=payload.encode("utf-8"), headers=headers) - except requests.exceptions.ConnectionError: - # Expeceted when Mz is killed - if exe.db.scenario != Scenario.Kill: + except (requests.exceptions.ConnectionError): + # Expected when Mz is killed + if exe.db.scenario not in (Scenario.Kill, Scenario.BackupRestore): raise diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index 16539d1d9a845..30f3fb5652eaf 100644 --- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -68,7 +68,7 @@ def run( random.seed(seed) print( - f"--- Running with: --seed={seed} --threads={num_threads} --runtime={runtime} --complexity={complexity.value} --scenario={scenario.value} --naughty_identifiers={naughty_identifiers} --databases={num_databases} (--host={host})" + f"--- Running with: --seed={seed} --threads={num_threads} --runtime={runtime} --complexity={complexity.value} --scenario={scenario.value} {'--naughty-identifiers ' if naughty_identifiers else ''}--databases={num_databases} (--host={host})" ) initialize_logging() From 7ff51d7222c21258577e4af1c94a88697f4dbef7 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Thu, 26 Oct 2023 13:07:06 +0000 Subject: [PATCH 11/17] parallel-workload: Try to reduce memory usage to prevent OoM --- .../materialize/parallel_workload/action.py | 7 ++++-- .../materialize/parallel_workload/database.py | 24 ++++++++++++------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/misc/python/materialize/parallel_workload/action.py 
b/misc/python/materialize/parallel_workload/action.py index 0ead98a123aa9..895afa35f1ad0 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -217,8 +217,7 @@ def run(self, exe: Executor) -> None: return exe.execute(query) exe.insert_table = table.table_id - with exe.db.lock: - table.num_rows += 1 + table.num_rows += 1 class SourceInsertAction(Action): @@ -230,6 +229,9 @@ def run(self, exe: Executor) -> None: source = self.rng.choice(sources) with source.lock: transaction = next(source.generator) + source.num_rows += sum( + [len(row_list.rows) for row_list in transaction.row_lists] + ) source.executor.run(transaction) @@ -1151,6 +1153,7 @@ def run(self, exe: Executor) -> None: f"POST Headers: {', '.join(headers_strs)} Body: {payload.encode('utf-8')}" ) try: + source.num_rows += 1 requests.post(url, data=payload.encode("utf-8"), headers=headers) except (requests.exceptions.ConnectionError): # Expected when Mz is killed diff --git a/misc/python/materialize/parallel_workload/database.py b/misc/python/materialize/parallel_workload/database.py index 3f6a39bb5e7a7..50d18dc62f479 100644 --- a/misc/python/materialize/parallel_workload/database.py +++ b/misc/python/materialize/parallel_workload/database.py @@ -35,16 +35,16 @@ MAX_COLUMNS = 100 MAX_INCLUDE_HEADERS = 5 MAX_ROWS = 1000 -MAX_CLUSTERS = 10 -MAX_CLUSTER_REPLICAS = 4 +MAX_CLUSTERS = 5 +MAX_CLUSTER_REPLICAS = 3 MAX_SCHEMAS = 10 -MAX_TABLES = 100 -MAX_VIEWS = 100 -MAX_ROLES = 100 -MAX_WEBHOOK_SOURCES = 20 -MAX_KAFKA_SOURCES = 20 -MAX_POSTGRES_SOURCES = 20 -MAX_KAFKA_SINKS = 20 +MAX_TABLES = 50 +MAX_VIEWS = 50 +MAX_ROLES = 50 +MAX_WEBHOOK_SOURCES = 10 +MAX_KAFKA_SOURCES = 10 +MAX_POSTGRES_SOURCES = 10 +MAX_KAFKA_SINKS = 10 MAX_INITIAL_SCHEMAS = 1 MAX_INITIAL_CLUSTERS = 2 @@ -329,6 +329,7 @@ class WebhookSource(DBObject): explicit_include_headers: list[str] check: str | None schema: Schema + num_rows: int def __init__( self, source_id: int, cluster: "Cluster", schema: Schema, rng: random.Random @@ -340,6 +341,7 @@ def __init__( self.body_format = rng.choice([e for e in BodyFormat]) self.include_headers = rng.choice([True, False]) self.explicit_include_headers = [] + self.num_rows = 0 self.columns = [ WebhookColumn( "body", @@ -409,6 +411,7 @@ class KafkaSource(DBObject): lock: threading.Lock columns: list[KafkaColumn] schema: Schema + num_rows: int def __init__( self, @@ -422,6 +425,7 @@ def __init__( self.source_id = source_id self.cluster = cluster self.schema = schema + self.num_rows = 0 fields = [] for i in range(rng.randint(1, 10)): fields.append( @@ -525,6 +529,7 @@ class PostgresSource(DBObject): lock: threading.Lock columns: list[PostgresColumn] schema: Schema + num_rows: int def __init__( self, @@ -538,6 +543,7 @@ def __init__( self.source_id = source_id self.cluster = cluster self.schema = schema + self.num_rows = 0 fields = [] for i in range(rng.randint(1, 10)): fields.append( From 420c6a0799a69b330aabbdcc71c1dcad65acab65 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Thu, 26 Oct 2023 17:42:23 +0000 Subject: [PATCH 12/17] Multiple dbs in same connection --- .../materialize/data_ingest/executor.py | 22 ++-- .../materialize/parallel_workload/action.py | 70 +++++++--- .../materialize/parallel_workload/database.py | 123 ++++++++++-------- .../materialize/parallel_workload/executor.py | 2 +- .../parallel_workload/parallel_workload.py | 83 +++++------- .../materialize/parallel_workload/worker.py | 56 ++++---- test/parallel-workload/mzcompose.py | 1 - 7 files 
changed, 183 insertions(+), 174 deletions(-) diff --git a/misc/python/materialize/data_ingest/executor.py b/misc/python/materialize/data_ingest/executor.py index ac9dc5758f89c..742619fae4543 100644 --- a/misc/python/materialize/data_ingest/executor.py +++ b/misc/python/materialize/data_ingest/executor.py @@ -218,10 +218,10 @@ def create(self) -> None: with self.mz_conn.cursor() as cur: self.execute( cur, - f"""CREATE SOURCE {identifier(self.schema)}.{identifier(self.table)} - FROM KAFKA CONNECTION kafka_conn (TOPIC '{self.topic}') + f"""CREATE SOURCE {identifier(self.database)}.{identifier(self.schema)}.{identifier(self.table)} + FROM KAFKA CONNECTION materialize.public.kafka_conn (TOPIC '{self.topic}') FORMAT AVRO - USING CONFLUENT SCHEMA REGISTRY CONNECTION csr_conn + USING CONFLUENT SCHEMA REGISTRY CONNECTION materialize.public.csr_conn ENVELOPE UPSERT""", ) self.mz_conn.autocommit = False @@ -334,7 +334,7 @@ def create(self) -> None: ) self.execute( cur, - f"""CREATE SOURCE {identifier(self.schema)}.{identifier(self.source)} + f"""CREATE SOURCE {identifier(self.database)}.{identifier(self.schema)}.{identifier(self.source)} FROM POSTGRES CONNECTION pg{self.num} (PUBLICATION 'postgres_source') FOR TABLES ({identifier(self.table)} AS {identifier(self.table)})""", ) @@ -426,13 +426,13 @@ def create(self) -> None: self.execute(cur, f"DROP TABLE IF EXISTS {identifier(self.table_original)}") self.execute( cur, - f"""CREATE TABLE {identifier(self.schema)}.{identifier(self.table_original)} ( + f"""CREATE TABLE {identifier(self.database)}.{identifier(self.schema)}.{identifier(self.table_original)} ( {", ".join(values)}, PRIMARY KEY ({", ".join(keys)}));""", ) self.execute( cur, - f"""CREATE SINK {identifier(self.schema)}.sink{self.num} FROM {identifier(self.table_original)} + f"""CREATE SINK {identifier(self.database)}.{identifier(self.schema)}.sink{self.num} FROM {identifier(self.table_original)} INTO KAFKA CONNECTION kafka_conn (TOPIC '{self.topic}') KEY ({", ".join([identifier(key) for key in keys])}) FORMAT AVRO @@ -441,7 +441,7 @@ def create(self) -> None: ) self.execute_with_retry_on_error( cur, - f"""CREATE SOURCE {identifier(self.schema)}.{identifier(self.table)} + f"""CREATE SOURCE {identifier(self.database)}.{identifier(self.schema)}.{identifier(self.table)} FROM KAFKA CONNECTION kafka_conn (TOPIC '{self.topic}') FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY CONNECTION csr_conn @@ -467,7 +467,7 @@ def run(self, transaction: Transaction) -> None: ) self.execute( cur, - f"""INSERT INTO {identifier(self.schema)}.{identifier(self.table_original)} + f"""INSERT INTO {identifier(self.database)}.{identifier(self.schema)}.{identifier(self.table_original)} VALUES ({values_str}) """, ) @@ -493,7 +493,7 @@ def run(self, transaction: Transaction) -> None: self.mz_conn.autocommit = True self.execute( cur, - f"""UPDATE {identifier(self.schema)}.{identifier(self.table_original)} + f"""UPDATE {identifier(self.database)}.{identifier(self.schema)}.{identifier(self.table_original)} SET {set_str} WHERE {cond_str} """, @@ -505,7 +505,7 @@ def run(self, transaction: Transaction) -> None: ) self.execute( cur, - f"""INSERT INTO {identifier(self.schema)}.{identifier(self.table_original)} + f"""INSERT INTO {identifier(self.database)}.{identifier(self.schema)}.{identifier(self.table_original)} VALUES ({values_str}) """, ) @@ -519,7 +519,7 @@ def run(self, transaction: Transaction) -> None: self.mz_conn.autocommit = True self.execute( cur, - f"""DELETE FROM {identifier(self.schema)}.{identifier(self.table_original)} 
+ f"""DELETE FROM {identifier(self.database)}.{identifier(self.schema)}.{identifier(self.table_original)} WHERE {cond_str} """, ) diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 895afa35f1ad0..aad7c581aaccc 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -21,9 +21,10 @@ from materialize.mzcompose.composition import Composition from materialize.mzcompose.services.minio import MINIO_BLOB_URI from materialize.parallel_workload.database import ( - DB_OFFSET, + DB, MAX_CLUSTER_REPLICAS, MAX_CLUSTERS, + MAX_DBS, MAX_KAFKA_SINKS, MAX_KAFKA_SOURCES, MAX_POSTGRES_SOURCES, @@ -438,6 +439,35 @@ def run(self, exe: Executor) -> None: raise +class CreateDatabaseAction(Action): + def run(self, exe: Executor) -> None: + with exe.db.lock: + if len(exe.db.dbs) > MAX_DBS: + return + db_id = exe.db.db_id + exe.db.db_id += 1 + db = DB(exe.db.seed, db_id) + db.create(exe) + exe.db.dbs.append(db) + + +class DropDatabaseAction(Action): + def errors_to_ignore(self, exe: Executor) -> list[str]: + return [ + "cannot be dropped with RESTRICT while it contains schemas", + ] + super().errors_to_ignore(exe) + + def run(self, exe: Executor) -> None: + with exe.db.lock: + if len(exe.db.dbs) <= 1: + return + db_id = self.rng.randrange(len(exe.db.dbs)) + db = exe.db.dbs[db_id] + query = f"DROP DATABASE {db} RESTRICT" + exe.execute(query) + del exe.db.dbs[db_id] + + class CreateSchemaAction(Action): def run(self, exe: Executor) -> None: with exe.db.lock: @@ -445,7 +475,7 @@ def run(self, exe: Executor) -> None: return schema_id = exe.db.schema_id exe.db.schema_id += 1 - schema = Schema(self.rng, schema_id) + schema = Schema(self.rng.choice(exe.db.dbs), schema_id) schema.create(exe) exe.db.schemas.append(schema) @@ -589,7 +619,7 @@ def run(self, exe: Executor) -> None: return role_id = exe.db.role_id exe.db.role_id += 1 - role = Role(exe.db.db_id * DB_OFFSET + role_id) + role = Role(role_id) role.create(exe) exe.db.roles.append(role) @@ -624,7 +654,7 @@ def run(self, exe: Executor) -> None: cluster_id = exe.db.cluster_id exe.db.cluster_id += 1 cluster = Cluster( - exe.db.db_id * DB_OFFSET + cluster_id, + cluster_id, managed=self.rng.choice([True, False]), size=self.rng.choice(["1", "2", "4"]), replication_factor=self.rng.choice([1, 2, 4, 5]), @@ -793,7 +823,7 @@ def run(self, exe: Executor) -> None: for i in range(NUM_ATTEMPTS): try: conn = pg8000.connect( - host=host, port=port, user=user, database=exe.db.name() + host=host, port=port, user=user, database="materialize" ) conn.autocommit = autocommit cur = conn.cursor() @@ -836,10 +866,10 @@ def run(self, exe: Executor) -> None: ) worker = None for i in range(len(self.workers)): - for worker_exe in self.workers[i].exes: - if worker_exe and worker_exe.pg_pid == pid: - worker = f"worker_{i}" - break + worker_exe = self.workers[i].exe + if worker_exe and worker_exe.pg_pid == pid: + worker = f"worker_{i}" + break assert worker exe.execute( f"SELECT pg_cancel_backend({pid})", extra_info=f"Canceling {worker}" @@ -870,22 +900,21 @@ def run(self, exe: Executor) -> None: # TODO: Don't restore immediately, keep copy Database objects class BackupRestoreAction(Action): composition: Composition - databases: list[Database] + db: Database num: int def __init__( - self, rng: random.Random, composition: Composition, databases: list[Database] + self, rng: random.Random, composition: Composition, db: Database ) -> None: super().__init__(rng) 
self.composition = composition - self.databases = databases + self.db = db self.num = 0 def run(self, exe: Executor) -> None: self.num += 1 time.sleep(self.rng.uniform(10, 240)) - for db in self.databases: - db.lock.acquire() + self.db.lock.acquire() try: # Backup @@ -928,8 +957,7 @@ def run(self, exe: Executor) -> None: self.composition.up("materialized") finally: - for db in self.databases: - db.lock.release() + self.db.lock.release() class CreateWebhookSourceAction(Action): @@ -984,8 +1012,7 @@ def run(self, exe: Executor) -> None: schema = self.rng.choice(exe.db.schemas) try: source = KafkaSource( - exe.db.name(), - exe.db.db_id * DB_OFFSET + source_id, + source_id, cluster, schema, exe.db.ports, @@ -1035,8 +1062,7 @@ def run(self, exe: Executor) -> None: cluster = self.rng.choice(potential_clusters) try: source = PostgresSource( - exe.db.name(), - exe.db.db_id * DB_OFFSET + source_id, + source_id, cluster, schema, exe.db.ports, @@ -1091,7 +1117,7 @@ def run(self, exe: Executor) -> None: cluster = self.rng.choice(potential_clusters) schema = self.rng.choice(exe.db.schemas) sink = KafkaSink( - exe.db.db_id * DB_OFFSET + sink_id, + sink_id, cluster, schema, self.rng.choice(exe.db.db_objects_without_views()), @@ -1243,6 +1269,8 @@ def __init__( (GrantPrivilegesAction, 4), (RevokePrivilegesAction, 1), (ReconnectAction, 1), + (CreateDatabaseAction, 1), + (DropDatabaseAction, 1), (CreateSchemaAction, 1), (DropSchemaAction, 1), (RenameSchemaAction, 10), diff --git a/misc/python/materialize/parallel_workload/database.py b/misc/python/materialize/parallel_workload/database.py index 50d18dc62f479..bfb43c0c4b95e 100644 --- a/misc/python/materialize/parallel_workload/database.py +++ b/misc/python/materialize/parallel_workload/database.py @@ -35,17 +35,19 @@ MAX_COLUMNS = 100 MAX_INCLUDE_HEADERS = 5 MAX_ROWS = 1000 -MAX_CLUSTERS = 5 -MAX_CLUSTER_REPLICAS = 3 -MAX_SCHEMAS = 10 -MAX_TABLES = 50 -MAX_VIEWS = 50 -MAX_ROLES = 50 -MAX_WEBHOOK_SOURCES = 10 -MAX_KAFKA_SOURCES = 10 -MAX_POSTGRES_SOURCES = 10 -MAX_KAFKA_SINKS = 10 - +MAX_CLUSTERS = 10 +MAX_CLUSTER_REPLICAS = 4 +MAX_DBS = 10 +MAX_SCHEMAS = 20 +MAX_TABLES = 100 +MAX_VIEWS = 100 +MAX_ROLES = 100 +MAX_WEBHOOK_SOURCES = 20 +MAX_KAFKA_SOURCES = 20 +MAX_POSTGRES_SOURCES = 20 +MAX_KAFKA_SINKS = 20 + +MAX_INITIAL_DBS = 1 MAX_INITIAL_SCHEMAS = 1 MAX_INITIAL_CLUSTERS = 2 MAX_INITIAL_TABLES = 10 @@ -56,8 +58,6 @@ MAX_INITIAL_POSTGRES_SOURCES = 3 MAX_INITIAL_KAFKA_SINKS = 3 -DB_OFFSET = 1_000_000 - NAUGHTY_IDENTIFIERS = False @@ -137,12 +137,35 @@ def create(self) -> str: return result +class DB: + seed: str + db_id: int + + def __init__(self, seed: str, db_id: int): + self.seed = seed + self.db_id = db_id + + def name(self) -> str: + return naughtify(f"db-pw-{self.seed}-{self.db_id}") + + def __str__(self) -> str: + return identifier(self.name()) + + def create(self, exe: Executor) -> None: + exe.execute(f"CREATE DATABASE {self}") + + def drop(self, exe: Executor) -> None: + exe.execute(f"DROP DATABASE IF EXISTS {self}") + + class Schema: schema_id: int rename: int + db: DB - def __init__(self, rng: random.Random, schema_id: int): + def __init__(self, db: DB, schema_id: int): self.schema_id = schema_id + self.db = db self.rename = 0 def name(self) -> str: @@ -151,7 +174,7 @@ def name(self) -> str: return naughtify(f"s-{self.schema_id}") def __str__(self) -> str: - return identifier(self.name()) + return f"{self.db}.{identifier(self.name())}" def create(self, exe: Executor) -> None: query = f"CREATE SCHEMA {self}" @@ -415,7 +438,6 @@ class 
KafkaSource(DBObject): def __init__( self, - database: str, source_id: int, cluster: "Cluster", schema: Schema, @@ -438,7 +460,7 @@ def __init__( KafkaColumn(field.name, field.data_type, False, self) for field in fields ] self.executor = KafkaExecutor( - self.source_id, ports, fields, database, schema.name() + self.source_id, ports, fields, schema.db.name(), schema.name() ) self.generator = rng.choice(list(WORKLOADS))(None).generate(fields) self.lock = threading.Lock() @@ -533,7 +555,6 @@ class PostgresSource(DBObject): def __init__( self, - database: str, source_id: int, cluster: "Cluster", schema: Schema, @@ -556,7 +577,7 @@ def __init__( PostgresColumn(field.name, field.data_type, False, self) for field in fields ] self.executor = PgExecutor( - self.source_id, ports, fields, database, schema.name() + self.source_id, ports, fields, schema.db.name(), schema.name() ) self.generator = rng.choice(list(WORKLOADS))(None).generate(fields) self.lock = threading.Lock() @@ -659,13 +680,14 @@ def create(self, exe: Executor) -> None: exe.execute(query) +# TODO: Can access both databases from same connection! class Database: - db_id: int - seed: str complexity: Complexity scenario: Scenario host: str ports: dict[str, int] + dbs: list[DB] + db_id: int schemas: list[Schema] schema_id: int tables: list[Table] @@ -686,10 +708,10 @@ class Database: kafka_sinks: list[KafkaSink] kafka_sink_id: int lock: threading.Lock + seed: str def __init__( self, - db_id: int, rng: random.Random, seed: str, host: str, @@ -699,16 +721,18 @@ def __init__( naughty_identifiers: bool, ): global NAUGHTY_IDENTIFIERS - self.db_id = db_id - self.seed = seed self.host = host self.ports = ports self.complexity = complexity self.scenario = scenario + self.seed = seed NAUGHTY_IDENTIFIERS = naughty_identifiers + self.dbs = [DB(seed, i) for i in range(rng.randint(1, MAX_INITIAL_DBS))] + self.db_id = len(self.dbs) self.schemas = [ - Schema(rng, i) for i in range(rng.randint(1, MAX_INITIAL_SCHEMAS)) + Schema(rng.choice(self.dbs), i) + for i in range(rng.randint(1, MAX_INITIAL_SCHEMAS)) ] self.schema_id = len(self.schemas) self.tables = [ @@ -727,15 +751,12 @@ def __init__( view = View(rng, i, base_object, base_object2, rng.choice(self.schemas)) self.views.append(view) self.view_id = len(self.views) - self.roles = [ - Role(db_id * DB_OFFSET + i) - for i in range(rng.randint(0, MAX_INITIAL_ROLES)) - ] + self.roles = [Role(i) for i in range(rng.randint(0, MAX_INITIAL_ROLES))] self.role_id = len(self.roles) # At least one storage cluster required for WebhookSources self.clusters = [ Cluster( - self.db_id * DB_OFFSET + i, + i, managed=rng.choice([True, False]), size=rng.choice(["1", "2", "4"]), replication_factor=1, @@ -752,8 +773,7 @@ def __init__( self.webhook_source_id = len(self.webhook_sources) self.kafka_sources = [ KafkaSource( - self.name(), - self.db_id * DB_OFFSET + i, + i, rng.choice(self.clusters), rng.choice(self.schemas), ports, @@ -764,8 +784,7 @@ def __init__( self.kafka_source_id = len(self.kafka_sources) self.postgres_sources = [ PostgresSource( - self.name(), - self.db_id * DB_OFFSET + i, + i, rng.choice(self.clusters), rng.choice(self.schemas), ports, @@ -787,12 +806,6 @@ def __init__( self.kafka_sink_id = len(self.kafka_sinks) self.lock = threading.Lock() - def name(self) -> str: - return naughtify(f"db-pw-{self.seed}-{self.db_id}") - - def __str__(self) -> str: - return identifier(self.name()) - def db_objects( self, ) -> list[WebhookSource | PostgresSource | KafkaSource | View | Table]: @@ -817,28 +830,24 @@ def 
__iter__(self): self.schemas + self.clusters + self.roles + self.db_objects() ).__iter__() - def drop(self, exe: Executor) -> None: - exe.execute(f"DROP DATABASE IF EXISTS {self}") - def create(self, exe: Executor) -> None: - self.drop(exe) - exe.execute(f"CREATE DATABASE {self}") - exe.execute(f"ALTER DATABASE {self} OWNER TO materialize") + for db in self.dbs: + db.drop(exe) + db.create(exe) - def create_relations(self, exe: Executor) -> None: - # Roles and clusters are system wide, not per DB - if self.db_id == 0: - exe.execute("SELECT name FROM mz_clusters WHERE name LIKE 'c%'") - for row in exe.cur.fetchall(): - exe.execute(f"DROP CLUSTER {identifier(row[0])} CASCADE") + exe.execute("SELECT name FROM mz_clusters WHERE name LIKE 'c%'") + for row in exe.cur.fetchall(): + exe.execute(f"DROP CLUSTER {identifier(row[0])} CASCADE") - exe.execute("SELECT name FROM mz_roles WHERE name LIKE 'r%'") - for row in exe.cur.fetchall(): - exe.execute(f"DROP ROLE {identifier(row[0])}") + exe.execute("SELECT name FROM mz_roles WHERE name LIKE 'r%'") + for row in exe.cur.fetchall(): + exe.execute(f"DROP ROLE {identifier(row[0])}") - exe.execute("CREATE CONNECTION kafka_conn FOR KAFKA BROKER 'kafka:9092'") exe.execute( - "CREATE CONNECTION csr_conn FOR CONFLUENT SCHEMA REGISTRY URL 'http://schema-registry:8081'" + "CREATE CONNECTION IF NOT EXISTS kafka_conn FOR KAFKA BROKER 'kafka:9092'" + ) + exe.execute( + "CREATE CONNECTION IF NOT EXISTS csr_conn FOR CONFLUENT SCHEMA REGISTRY URL 'http://schema-registry:8081'" ) print("Created connections") diff --git a/misc/python/materialize/parallel_workload/executor.py b/misc/python/materialize/parallel_workload/executor.py index 1159c0bba159c..db7910b96a565 100644 --- a/misc/python/materialize/parallel_workload/executor.py +++ b/misc/python/materialize/parallel_workload/executor.py @@ -73,7 +73,7 @@ def log(self, msg: str) -> None: thread_name = threading.current_thread().getName() with lock: - print(f"[{thread_name}][{self.db.name()}] {msg}", file=logging) + print(f"[{thread_name}] {msg}", file=logging) logging.flush() def execute( diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index 30f3fb5652eaf..029ae47861fa2 100644 --- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -61,14 +61,13 @@ def run( scenario: Scenario, num_threads: int | None, naughty_identifiers: bool, - num_databases: int, composition: Composition | None, ) -> None: num_threads = num_threads or os.cpu_count() or 10 random.seed(seed) print( - f"--- Running with: --seed={seed} --threads={num_threads} --runtime={runtime} --complexity={complexity.value} --scenario={scenario.value} {'--naughty-identifiers ' if naughty_identifiers else ''}--databases={num_databases} (--host={host})" + f"--- Running with: --seed={seed} --threads={num_threads} --runtime={runtime} --complexity={complexity.value} --scenario={scenario.value} {'--naughty-identifiers ' if naughty_identifiers else ''}(--host={host})" ) initialize_logging() @@ -77,40 +76,29 @@ def run( ).timestamp() rng = random.Random(random.randrange(SEED_RANGE)) - databases = [ - Database(i, rng, seed, host, ports, complexity, scenario, naughty_identifiers) - for i in range(num_databases) - ] + database = Database( + rng, seed, host, ports, complexity, scenario, naughty_identifiers + ) system_conn = pg8000.connect( host=host, port=ports["mz_system"], user="mz_system", 
database="materialize" ) system_conn.autocommit = True with system_conn.cursor() as system_cur: - system_exe = Executor(rng, system_cur, databases[0]) + system_exe = Executor(rng, system_cur, database) system_exe.execute("ALTER SYSTEM SET enable_webhook_sources TO true") system_exe.execute( f"ALTER SYSTEM SET max_schemas_per_database = {MAX_SCHEMAS * 2}" ) # The presence of ALTER TABLE RENAME can cause the total number of tables to exceed MAX_TABLES + system_exe.execute(f"ALTER SYSTEM SET max_tables = {MAX_TABLES * 2}") + system_exe.execute(f"ALTER SYSTEM SET max_materialized_views = {MAX_VIEWS * 2}") system_exe.execute( - f"ALTER SYSTEM SET max_tables = {len(databases) * MAX_TABLES * 2}" - ) - system_exe.execute( - f"ALTER SYSTEM SET max_materialized_views = {len(databases) * MAX_VIEWS * 2}" - ) - system_exe.execute( - f"ALTER SYSTEM SET max_sources = {len(databases) * (MAX_WEBHOOK_SOURCES + MAX_KAFKA_SOURCES + MAX_POSTGRES_SOURCES) * 2}" - ) - system_exe.execute( - f"ALTER SYSTEM SET max_sinks = {len(databases) * MAX_KAFKA_SINKS * 2}" - ) - system_exe.execute( - f"ALTER SYSTEM SET max_roles = {len(databases) * MAX_ROLES * 2}" - ) - system_exe.execute( - f"ALTER SYSTEM SET max_clusters = {len(databases) * MAX_CLUSTERS * 2}" + f"ALTER SYSTEM SET max_sources = {(MAX_WEBHOOK_SOURCES + MAX_KAFKA_SOURCES + MAX_POSTGRES_SOURCES) * 2}" ) + system_exe.execute(f"ALTER SYSTEM SET max_sinks = {MAX_KAFKA_SINKS * 2}") + system_exe.execute(f"ALTER SYSTEM SET max_roles = {MAX_ROLES * 2}") + system_exe.execute(f"ALTER SYSTEM SET max_clusters = {MAX_CLUSTERS * 2}") system_exe.execute( f"ALTER SYSTEM SET max_replicas_per_cluster = {MAX_CLUSTER_REPLICAS * 2}" ) @@ -136,20 +124,17 @@ def run( system_exe.execute( "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON CLUSTERS TO PUBLIC" ) - for database in databases: - database.create(system_exe) - - conn = pg8000.connect( - host=host, - port=ports["materialized"], - user="materialize", - database=database.name(), - ) - conn.autocommit = True - with conn.cursor() as cur: - database.create_relations(Executor(rng, cur, database)) - conn.close() - system_conn.close() + system_conn.close() + conn = pg8000.connect( + host=host, + port=ports["materialized"], + user="materialize", + database="materialize", + ) + conn.autocommit = True + with conn.cursor() as cur: + database.create(Executor(rng, cur, database)) + conn.close() workers = [] threads = [] @@ -194,7 +179,7 @@ def run( thread = threading.Thread( name=thread_name, target=worker.run, - args=(host, ports["materialized"], "materialize", databases), + args=(host, ports["materialized"], "materialize", database), ) thread.start() threads.append(thread) @@ -212,7 +197,7 @@ def run( thread = threading.Thread( name="cancel", target=worker.run, - args=(host, ports["mz_system"], "mz_system", databases), + args=(host, ports["mz_system"], "mz_system", database), ) thread.start() threads.append(thread) @@ -230,7 +215,7 @@ def run( thread = threading.Thread( name="kill", target=worker.run, - args=(host, ports["materialized"], "materialize", databases), + args=(host, ports["materialized"], "materialize", database), ) thread.start() threads.append(thread) @@ -238,7 +223,7 @@ def run( assert composition, "Backup & Restore scenario only works in mzcompose" worker = Worker( worker_rng, - [BackupRestoreAction(worker_rng, composition, databases)], + [BackupRestoreAction(worker_rng, composition, database)], [1], end_time, autocommit=False, @@ -248,7 +233,7 @@ def run( thread = threading.Thread( name="kill", 
target=worker.run, - args=(host, ports["materialized"], "materialize", databases), + args=(host, ports["materialized"], "materialize", database), ) thread.start() threads.append(thread) @@ -286,9 +271,10 @@ def run( conn = pg8000.connect(host=host, port=ports["materialized"], user="materialize") conn.autocommit = True with conn.cursor() as cur: - for database in databases: - print(f"Dropping database {database}") - database.drop(Executor(rng, cur, database)) + exe = Executor(rng, cur, database) + print(f"Dropping database {database}") + for db in database.dbs: + db.drop(exe) conn.close() ignored_errors: defaultdict[str, Counter[type[Action]]] = defaultdict(Counter) @@ -336,12 +322,6 @@ def parse_common_args(parser: argparse.ArgumentParser) -> None: action="store_true", help="Whether to use naughty strings as identifiers, makes the queries unreadable", ) - parser.add_argument( - "--databases", - default=2, - type=int, - help="Number of databases to create and run against, 2 by default", - ) def main() -> int: @@ -390,7 +370,6 @@ def main() -> int: Scenario(args.scenario), args.threads, args.naughty_identifiers, - args.databases, composition=None, # only works in mzcompose ) return 0 diff --git a/misc/python/materialize/parallel_workload/worker.py b/misc/python/materialize/parallel_workload/worker.py index 92eecd72fc8cc..83b7ac3e325f0 100644 --- a/misc/python/materialize/parallel_workload/worker.py +++ b/misc/python/materialize/parallel_workload/worker.py @@ -28,7 +28,7 @@ class Worker: num_queries: int autocommit: bool system: bool - exes: list[Executor] + exe: Executor | None ignored_errors: defaultdict[str, Counter[type[Action]]] def __init__( @@ -48,30 +48,26 @@ def __init__( self.autocommit = autocommit self.system = system self.ignored_errors = defaultdict(Counter) - self.exes = [] + self.exe = None - def run(self, host: str, port: int, user: str, databases: list[Database]) -> None: - self.conns = [ - pg8000.connect(host=host, port=port, user=user, database=database.name()) - for database in databases - ] - for database, conn in zip(databases, self.conns): - conn.autocommit = self.autocommit - cur = conn.cursor() - exe = Executor(self.rng, cur, database) - exe.set_isolation("SERIALIZABLE") - cur.execute("SELECT pg_backend_pid()") - exe.pg_pid = cur.fetchall()[0][0] - self.exes.append(exe) + def run(self, host: str, port: int, user: str, database: Database) -> None: + self.conn = pg8000.connect( + host=host, port=port, user=user, database="materialize" + ) + self.conn.autocommit = self.autocommit + cur = self.conn.cursor() + self.exe = Executor(self.rng, cur, database) + self.exe.set_isolation("SERIALIZABLE") + cur.execute("SELECT pg_backend_pid()") + self.exe.pg_pid = cur.fetchall()[0][0] while time.time() < self.end_time: - exe = self.rng.choice(self.exes) action = self.rng.choices(self.actions, self.weights)[0] self.num_queries += 1 try: - if exe.rollback_next: + if self.exe.rollback_next: try: - exe.rollback() + self.exe.rollback() except QueryError as e: if ( "Please disconnect and re-connect" in e.msg @@ -79,16 +75,16 @@ def run(self, host: str, port: int, user: str, databases: list[Database]) -> Non or "Can't create a connection to host" in e.msg or "Connection refused" in e.msg ): - exe.reconnect_next = True - exe.rollback_next = False + self.exe.reconnect_next = True + self.exe.rollback_next = False continue - exe.rollback_next = False - if exe.reconnect_next: - ReconnectAction(self.rng, random_role=False).run(exe) - exe.reconnect_next = False - action.run(exe) + 
self.exe.rollback_next = False + if self.exe.reconnect_next: + ReconnectAction(self.rng, random_role=False).run(self.exe) + self.exe.reconnect_next = False + action.run(self.exe) except QueryError as e: - for error in action.errors_to_ignore(exe): + for error in action.errors_to_ignore(self.exe): if error in e.msg: self.ignored_errors[error][type(action)] += 1 if ( @@ -97,13 +93,11 @@ def run(self, host: str, port: int, user: str, databases: list[Database]) -> Non or "Can't create a connection to host" in e.msg or "Connection refused" in e.msg ): - exe.reconnect_next = True + self.exe.reconnect_next = True else: - exe.rollback_next = True + self.exe.rollback_next = True break else: thread_name = threading.current_thread().getName() - print( - f"[{thread_name}][{exe.db.name()}] Query failed: {e.query} {e.msg}" - ) + print(f"[{thread_name}] Query failed: {e.query} {e.msg}") raise diff --git a/test/parallel-workload/mzcompose.py b/test/parallel-workload/mzcompose.py index c4ee9f7b5974f..50dcadb0dd6cd 100644 --- a/test/parallel-workload/mzcompose.py +++ b/test/parallel-workload/mzcompose.py @@ -90,7 +90,6 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: Scenario(args.scenario), args.threads, args.naughty_identifiers, - args.databases, c, ) # TODO: Only ignore errors that will be handled by parallel-workload, not others From 94f10607be5aa992e85e7a0d426b6e6c8e66f973 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 27 Oct 2023 10:51:45 +0000 Subject: [PATCH 13/17] parallel-workload: Handle stuck queries --- .../materialize/data_ingest/executor.py | 2 +- .../materialize/parallel_workload/action.py | 92 ++++++++++--------- .../materialize/parallel_workload/executor.py | 3 + .../parallel_workload/parallel_workload.py | 15 ++- 4 files changed, 65 insertions(+), 47 deletions(-) diff --git a/misc/python/materialize/data_ingest/executor.py b/misc/python/materialize/data_ingest/executor.py index 742619fae4543..33e269fc6962c 100644 --- a/misc/python/materialize/data_ingest/executor.py +++ b/misc/python/materialize/data_ingest/executor.py @@ -78,7 +78,7 @@ def execute(self, cur: pg8000.Cursor, query: str) -> None: with self.mz_conn.cursor() as cur: self.execute(cur, query) except Exception as e: - print(f"Query failed: {query}") + print(f"Query failed: {query} {e}") raise QueryError(str(e), query) def execute_with_retry_on_error( diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index aad7c581aaccc..cdab0052c5a5f 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -516,7 +516,9 @@ def run(self, exe: Executor) -> None: old_name = str(schema) schema.rename += 1 try: - exe.execute(f"ALTER SCHEMA {old_name} RENAME TO {schema}") + exe.execute( + f"ALTER SCHEMA {old_name} RENAME TO {identifier(schema.name())}" + ) except: schema.rename -= 1 raise @@ -524,24 +526,26 @@ def run(self, exe: Executor) -> None: class SwapSchemaAction(Action): def run(self, exe: Executor) -> None: - if self.db.scenario != Scenario.Rename: + if exe.db.scenario != Scenario.Rename: return - with self.db.lock: - if len(self.db.schemas) < 2: + with exe.db.lock: + db = self.rng.choice(exe.db.dbs) + schemas = [schema for schema in exe.db.schemas if schema.db == db] + if len(schemas) < 2: return - (i1, schema1), (i2, schema2) = self.rng.sample( - list(enumerate(self.db.schemas)), 2 - ) - self.db.schemas[i1], self.db.schemas[i2] = ( - self.db.schemas[i2], - 
self.db.schemas[i1], + (i1, schema1), (i2, schema2) = self.rng.sample(list(enumerate(schemas)), 2) + exe.db.schemas[i1], exe.db.schemas[i2] = ( + exe.db.schemas[i2], + exe.db.schemas[i1], ) try: - exe.execute(f"ALTER SCHEMA {schema1} SWAP WITH {schema2}") + exe.execute( + f"ALTER SCHEMA {schema1} SWAP WITH {identifier(schema2.name())}" + ) except: - self.db.schemas[i1], self.db.schemas[i2] = ( - self.db.schemas[i2], - self.db.schemas[i1], + exe.db.schemas[i1], exe.db.schemas[i2] = ( + exe.db.schemas[i2], + exe.db.schemas[i1], ) raise @@ -862,7 +866,7 @@ def __init__( def run(self, exe: Executor) -> None: pid = self.rng.choice( - [exe.pg_pid for worker in self.workers for exe in worker.exes if exe and exe.pg_pid != -1] # type: ignore + [worker.exe.pg_pid for worker in self.workers if worker.exe and worker.exe.pg_pid != -1] # type: ignore ) worker = None for i in range(len(self.workers)): @@ -1007,22 +1011,22 @@ def run(self, exe: Executor) -> None: return source_id = exe.db.kafka_source_id exe.db.kafka_source_id += 1 - potential_clusters = [c for c in exe.db.clusters if len(c.replicas) == 1] - cluster = self.rng.choice(potential_clusters) - schema = self.rng.choice(exe.db.schemas) - try: - source = KafkaSource( - source_id, - cluster, - schema, - exe.db.ports, - self.rng, - ) - source.create(exe) - exe.db.kafka_sources.append(source) - except: - if exe.db.scenario != Scenario.Kill: - raise + potential_clusters = [c for c in exe.db.clusters if len(c.replicas) == 1] + cluster = self.rng.choice(potential_clusters) + schema = self.rng.choice(exe.db.schemas) + try: + source = KafkaSource( + source_id, + cluster, + schema, + exe.db.ports, + self.rng, + ) + source.create(exe) + exe.db.kafka_sources.append(source) + except: + if exe.db.scenario != Scenario.Kill: + raise class DropKafkaSourceAction(Action): @@ -1060,19 +1064,19 @@ def run(self, exe: Executor) -> None: potential_clusters = [c for c in exe.db.clusters if len(c.replicas) == 1] schema = self.rng.choice(exe.db.schemas) cluster = self.rng.choice(potential_clusters) - try: - source = PostgresSource( - source_id, - cluster, - schema, - exe.db.ports, - self.rng, - ) - source.create(exe) - exe.db.postgres_sources.append(source) - except: - if exe.db.scenario != Scenario.Kill: - raise + try: + source = PostgresSource( + source_id, + cluster, + schema, + exe.db.ports, + self.rng, + ) + source.create(exe) + exe.db.postgres_sources.append(source) + except: + if exe.db.scenario != Scenario.Kill: + raise class DropPostgresSourceAction(Action): diff --git a/misc/python/materialize/parallel_workload/executor.py b/misc/python/materialize/parallel_workload/executor.py index db7910b96a565..b00b13aa2e1c3 100644 --- a/misc/python/materialize/parallel_workload/executor.py +++ b/misc/python/materialize/parallel_workload/executor.py @@ -37,6 +37,7 @@ class Executor: db: "Database" reconnect_next: bool rollback_next: bool + last_log: str def __init__(self, rng: random.Random, cur: pg8000.Cursor, db: "Database"): self.rng = rng @@ -46,6 +47,7 @@ def __init__(self, rng: random.Random, cur: pg8000.Cursor, db: "Database"): self.insert_table = None self.reconnect_next = True self.rollback_next = True + self.last_log = "" def set_isolation(self, level: str) -> None: self.execute(f"SET TRANSACTION_ISOLATION TO '{level}'") @@ -71,6 +73,7 @@ def log(self, msg: str) -> None: return thread_name = threading.current_thread().getName() + self.last_log = msg with lock: print(f"[{thread_name}] {msg}", file=logging) diff --git 
a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index 029ae47861fa2..5895870d3b8e1 100644 --- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -265,8 +265,19 @@ def run( for worker in workers: worker.end_time = time.time() - for thread in threads: - thread.join() + stopping_time = ( + datetime.datetime.now() + datetime.timedelta(seconds=300) + ).timestamp() + while time.time() < stopping_time: + for worker, thread in zip(workers, threads): + thread.join(timeout=1) + if thread.is_alive(): + print(f"{thread.name} still running: {worker.exe.last_log}") + if all([not thread.is_alive() for thread in threads]): + break + else: + print("Threads have not stopped within 5 minutes, exiting hard") + sys.exit(1) conn = pg8000.connect(host=host, port=ports["materialized"], user="materialize") conn.autocommit = True From 4509680454ecf82a836bc91d1357e567579f89b6 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 27 Oct 2023 10:53:57 +0000 Subject: [PATCH 14/17] Address reviewer comment --- .../parallel_workload/parallel_workload.py | 33 +++++++------------ 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index 5895870d3b8e1..86203548de8dc 100644 --- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -103,27 +103,18 @@ def run( f"ALTER SYSTEM SET max_replicas_per_cluster = {MAX_CLUSTER_REPLICAS * 2}" ) # Most queries should not fail because of privileges - system_exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON TABLES TO PUBLIC" - ) - system_exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON TYPES TO PUBLIC" - ) - system_exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON SECRETS TO PUBLIC" - ) - system_exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON CONNECTIONS TO PUBLIC" - ) - system_exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON DATABASES TO PUBLIC" - ) - system_exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON SCHEMAS TO PUBLIC" - ) - system_exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON CLUSTERS TO PUBLIC" - ) + for object_type in [ + "TABLES", + "TYPES", + "SECRETS", + "CONNECTIONS", + "DATABASES", + "SCHEMAS", + "CLUSTERS", + ]: + system_exe.execute( + f"ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON {object_type} TO PUBLIC" + ) system_conn.close() conn = pg8000.connect( host=host, From 28847a20d087c5d09aa2118c8e581b6e2925ad7c Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 27 Oct 2023 14:56:48 +0000 Subject: [PATCH 15/17] Better handling of existing errors --- .../materialize/parallel_workload/action.py | 50 ++++++++++--------- .../parallel_workload/parallel_workload.py | 9 ++-- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index cdab0052c5a5f..4043488c22b71 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -88,7 +88,7 @@ def errors_to_ignore(self, exe: Executor) -> 
list[str]: "canceling statement due to user request", ] ) - if exe.db.scenario == Scenario.Kill: + if exe.db.scenario in (Scenario.Kill, Scenario.BackupRestore): result.extend( [ "network error", @@ -339,9 +339,9 @@ def run(self, exe: Executor) -> None: exe.execute(query) except QueryError as e: # expected, see #20465 - if ( - exe.db.scenario != Scenario.Kill - or "unknown catalog item" not in e.msg + if exe.db.scenario != Scenario.Kill or ( + "unknown catalog item" not in e.msg + and "unknown schema" not in e.msg ): raise e exe.db.indexes.remove(index_name) @@ -374,9 +374,9 @@ def run(self, exe: Executor) -> None: exe.execute(query) except QueryError as e: # expected, see #20465 - if ( - exe.db.scenario != Scenario.Kill - or "unknown catalog item" not in e.msg + if exe.db.scenario != Scenario.Kill or ( + "unknown catalog item" not in e.msg + and "unknown schema" not in e.msg ): raise e exe.db.tables.remove(table) @@ -573,8 +573,10 @@ def run(self, exe: Executor) -> None: exe.db.view_id += 1 # Don't use views for now since LIMIT 1 and statement_timeout are # not effective yet at preventing long-running queries and OoMs. - base_object = self.rng.choice(exe.db.db_objects()) - base_object2: DBObject | None = self.rng.choice(exe.db.db_objects()) + base_object = self.rng.choice(exe.db.db_objects_without_views()) + base_object2: DBObject | None = self.rng.choice( + exe.db.db_objects_without_views() + ) if self.rng.choice([True, False]) or base_object2 == base_object: base_object2 = None view = View( @@ -608,9 +610,9 @@ def run(self, exe: Executor) -> None: exe.execute(query) except QueryError as e: # expected, see #20465 - if ( - exe.db.scenario != Scenario.Kill - or "unknown catalog item" not in e.msg + if exe.db.scenario != Scenario.Kill or ( + "unknown catalog item" not in e.msg + and "unknown schema" not in e.msg ): raise e del exe.db.views[view_id] @@ -996,9 +998,9 @@ def run(self, exe: Executor) -> None: exe.execute(query) except QueryError as e: # expected, see #20465 - if ( - exe.db.scenario != Scenario.Kill - or "unknown catalog item" not in e.msg + if exe.db.scenario != Scenario.Kill or ( + "unknown catalog item" not in e.msg + and "unknown schema" not in e.msg ): raise e del exe.db.webhook_sources[source_id] @@ -1046,9 +1048,9 @@ def run(self, exe: Executor) -> None: exe.execute(query) except QueryError as e: # expected, see #20465 - if ( - exe.db.scenario != Scenario.Kill - or "unknown catalog item" not in e.msg + if exe.db.scenario != Scenario.Kill or ( + "unknown catalog item" not in e.msg + and "unknown schema" not in e.msg ): raise e del exe.db.kafka_sources[source_id] @@ -1096,9 +1098,9 @@ def run(self, exe: Executor) -> None: exe.execute(query) except QueryError as e: # expected, see #20465 - if ( - exe.db.scenario != Scenario.Kill - or "unknown catalog item" not in e.msg + if exe.db.scenario != Scenario.Kill or ( + "unknown catalog item" not in e.msg + and "unknown schema" not in e.msg ): raise e del exe.db.postgres_sources[source_id] @@ -1148,9 +1150,9 @@ def run(self, exe: Executor) -> None: exe.execute(query) except QueryError as e: # expected, see #20465 - if ( - exe.db.scenario != Scenario.Kill - or "unknown catalog item" not in e.msg + if exe.db.scenario != Scenario.Kill or ( + "unknown catalog item" not in e.msg + and "unknown schema" not in e.msg ): raise e del exe.db.kafka_sinks[sink_id] diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index 86203548de8dc..9ad024b3a37d0 100644 
--- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -260,15 +260,16 @@ def run( datetime.datetime.now() + datetime.timedelta(seconds=300) ).timestamp() while time.time() < stopping_time: - for worker, thread in zip(workers, threads): + for thread in threads: thread.join(timeout=1) - if thread.is_alive(): - print(f"{thread.name} still running: {worker.exe.last_log}") if all([not thread.is_alive() for thread in threads]): break else: + for worker, thread in zip(workers, threads): + if thread.is_alive(): + print(f"{thread.name} still running: {worker.exe.last_log}") print("Threads have not stopped within 5 minutes, exiting hard") - sys.exit(1) + os._exit(1) conn = pg8000.connect(host=host, port=ports["materialized"], user="materialize") conn.autocommit = True From 7ae02eb1670234fa45b2742385fe11e84c4b0341 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 27 Oct 2023 14:55:34 +0000 Subject: [PATCH 16/17] Make parallel-workload reproducible hash and set/dict order are not deterministic between runs --- .../python/materialize/data_ingest/data_type.py | 17 ++++++++++------- .../materialize/parallel_workload/database.py | 4 +++- .../parallel_workload/parallel_workload.py | 5 ++++- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/misc/python/materialize/data_ingest/data_type.py b/misc/python/materialize/data_ingest/data_type.py index 69e0963fd2e97..ece463df8d8e9 100644 --- a/misc/python/materialize/data_ingest/data_type.py +++ b/misc/python/materialize/data_ingest/data_type.py @@ -60,7 +60,7 @@ def random_value( record_size: RecordSize = RecordSize.LARGE, in_query: bool = False, ) -> Any: - return random.choice((True, False)) + return rng.choice((True, False)) @staticmethod def name(backend: Backend = Backend.POSTGRES) -> str: @@ -243,13 +243,13 @@ def random_value( # chars = string.printable chars = string.ascii_letters + string.digits if record_size == RecordSize.TINY: - result = random.choice(("foo", "bar", "baz")) + result = rng.choice(("foo", "bar", "baz")) elif record_size == RecordSize.SMALL: - result = "".join(random.choice(chars) for _ in range(3)) + result = "".join(rng.choice(chars) for _ in range(3)) elif record_size == RecordSize.MEDIUM: - result = "".join(random.choice(chars) for _ in range(10)) + result = "".join(rng.choice(chars) for _ in range(10)) elif record_size == RecordSize.LARGE: - result = "".join(random.choice(chars) for _ in range(100)) + result = "".join(rng.choice(chars) for _ in range(100)) else: raise ValueError(f"Unexpected record size {record_size}") @@ -357,10 +357,13 @@ def numeric_value(num: int, in_query: bool = False) -> Any: return f"'{values_str}'::map[text=>text]" if in_query else values_str -DATA_TYPES = list(all_subclasses(DataType)) +# Sort to keep determinism for reproducible runs with specific seed +DATA_TYPES = sorted(list(all_subclasses(DataType)), key=repr) # fastavro._schema_common.UnknownType: record # bytea requires Python bytes type instead of str -DATA_TYPES_FOR_AVRO = list(set(DATA_TYPES) - {TextTextMap, Jsonb, Bytea, Boolean}) +DATA_TYPES_FOR_AVRO = sorted( + list(set(DATA_TYPES) - {TextTextMap, Jsonb, Bytea, Boolean}), key=repr +) NUMBER_TYPES = [SmallInt, Int, Long, Float, Double] diff --git a/misc/python/materialize/parallel_workload/database.py b/misc/python/materialize/parallel_workload/database.py index bfb43c0c4b95e..edfeb49fd4bef 100644 --- a/misc/python/materialize/parallel_workload/database.py +++ 
b/misc/python/materialize/parallel_workload/database.py @@ -71,7 +71,9 @@ def naughtify(name: str) -> str: strings = naughty_strings() # This rng is just to get a more interesting integer for the name - index = abs(hash(name)) % len(strings) + index = sum([10**i * c for i, c in enumerate(name.encode("utf-8"))]) % len( + strings + ) # Keep them short so we can combine later with other identifiers, 255 char limit return f"{name}_{strings[index].encode('utf-8')[:16].decode('utf-8', 'ignore')}" diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index 9ad024b3a37d0..2d2efe766b3f0 100644 --- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -129,7 +129,6 @@ def run( workers = [] threads = [] - worker_rng = random.Random(rng.randrange(SEED_RANGE)) for i in range(num_threads): weights: list[float] if complexity == Complexity.DDL: @@ -140,6 +139,7 @@ def run( weights = [60, 30, 0, 0, 0] else: raise ValueError(f"Unknown complexity {complexity}") + worker_rng = random.Random(rng.randrange(SEED_RANGE)) action_list = worker_rng.choices( [ read_action_list, @@ -176,6 +176,7 @@ def run( threads.append(thread) if scenario == Scenario.Cancel: + worker_rng = random.Random(rng.randrange(SEED_RANGE)) worker = Worker( worker_rng, [CancelAction(worker_rng, workers)], @@ -193,6 +194,7 @@ def run( thread.start() threads.append(thread) elif scenario == Scenario.Kill: + worker_rng = random.Random(rng.randrange(SEED_RANGE)) assert composition, "Kill scenario only works in mzcompose" worker = Worker( worker_rng, @@ -211,6 +213,7 @@ def run( thread.start() threads.append(thread) elif scenario == Scenario.BackupRestore: + worker_rng = random.Random(rng.randrange(SEED_RANGE)) assert composition, "Backup & Restore scenario only works in mzcompose" worker = Worker( worker_rng, From a1ba25382b279afa164237f7d964a9ad21377162 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 27 Oct 2023 22:58:15 +0000 Subject: [PATCH 17/17] Workaround for 22717 --- .../materialize/parallel_workload/parallel_workload.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index 2d2efe766b3f0..ff31d19ed6c7f 100644 --- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -67,7 +67,7 @@ def run( random.seed(seed) print( - f"--- Running with: --seed={seed} --threads={num_threads} --runtime={runtime} --complexity={complexity.value} --scenario={scenario.value} {'--naughty-identifiers ' if naughty_identifiers else ''}(--host={host})" + f"+++ Running with: --seed={seed} --threads={num_threads} --runtime={runtime} --complexity={complexity.value} --scenario={scenario.value} {'--naughty-identifiers ' if naughty_identifiers else ''}(--host={host})" ) initialize_logging() @@ -272,7 +272,8 @@ def run( if thread.is_alive(): print(f"{thread.name} still running: {worker.exe.last_log}") print("Threads have not stopped within 5 minutes, exiting hard") - os._exit(1) + # TODO(def-): Switch to failing exit code when https://github.com/MaterializeInc/materialize/issues/22717 is fixed + os._exit(0) conn = pg8000.connect(host=host, port=ports["materialized"], user="materialize") conn.autocommit = True
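
A minimal sketch, separate from the patch series above, of the non-determinism that PATCH 16 ("Make parallel-workload reproducible") works around: Python salts str hashing per process (unless PYTHONHASHSEED is pinned), so `abs(hash(name)) % len(strings)` in naughtify() could select a different naughty string on every run, and set iteration order (e.g. the subclass set behind DATA_TYPES) shifts with it. The positional byte sum introduced by the patch, plus sorting the type list by repr, is stable across runs. The modulus 515 below is only a stand-in for the real `len(naughty_strings())`, which is not shown in this patch.

    def deterministic_index(name: str, modulus: int) -> int:
        # Same scheme as the patched naughtify(): weight each UTF-8 byte by 10**position,
        # which depends only on the name itself, never on interpreter hash salting.
        return sum(10**i * c for i, c in enumerate(name.encode("utf-8"))) % modulus

    def salted_index(name: str, modulus: int) -> int:
        # What the old code did; varies between interpreter invocations unless
        # PYTHONHASHSEED is fixed, so reruns with the same --seed diverge.
        return abs(hash(name)) % modulus

    if __name__ == "__main__":
        for name in ("t-0", "s-1", "db-pw-seed-2"):
            # deterministic_index prints the same value on every run;
            # salted_index generally does not.
            print(name, deterministic_index(name, 515), salted_index(name, 515))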