From 75e160d9894ec829d7efab3380a492ef4a85c3e1 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 20 Oct 2023 16:29:04 +0000 Subject: [PATCH 01/17] parallel-workload: Less locking --- ci/nightly/pipeline.template.yml | 12 +- .../materialize/parallel_workload/action.py | 276 +++++++++++------- .../materialize/parallel_workload/database.py | 44 +++ .../parallel_workload/parallel_workload.py | 27 +- .../materialize/parallel_workload/settings.py | 1 + 5 files changed, 231 insertions(+), 129 deletions(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index e4fbb28ec216b..ba7813703d038 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -871,7 +871,17 @@ steps: - ./ci/plugins/mzcompose: composition: parallel-workload args: [--runtime=1500, --scenario=kill] - skip: "TODO(def-) Enable after figuring out restoring catalog" + + - id: parallel-workload-backup-restore + label: "Parallel Workload (backup & restore)" + artifact_paths: [junit_*.xml, parallel-workload-queries.log.zst] + timeout_in_minutes: 40 + agents: + queue: builder-linux-x86_64 + plugins: + - ./ci/plugins/mzcompose: + composition: parallel-workload + args: [--runtime=1500, --scenario=backup-restore] - id: incident-70 label: "Test for incident 70" diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 6de7c2c586d30..7b3a045c17b0e 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -231,9 +231,9 @@ def run(self, exe: Executor) -> None: if not sources: return source = self.rng.choice(sources) + with source.lock: transaction = next(source.generator) - with source.lock: - source.executor.run(transaction) + source.executor.run(transaction) class UpdateAction(Action): @@ -341,14 +341,13 @@ def run(self, exe: Executor) -> None: class CreateTableAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.tables) > MAX_TABLES: - return - table_id = self.db.table_id - self.db.table_id += 1 - table = Table(self.rng, table_id, self.rng.choice(self.db.schemas)) - table.create(exe) - self.db.tables.append(table) + if len(self.db.tables) > MAX_TABLES: + return + table_id = self.db.table_id + self.db.table_id += 1 + table = Table(self.rng, table_id, self.rng.choice(self.db.schemas)) + table.create(exe) + self.db.tables.append(table) class DropTableAction(Action): @@ -361,11 +360,10 @@ def run(self, exe: Executor) -> None: with self.db.lock: if len(self.db.tables) <= 2: return - table_id = self.rng.randrange(len(self.db.tables)) - table = self.db.tables[table_id] + table = self.rng.choice(self.db.tables) query = f"DROP TABLE {table}" exe.execute(query) - del self.db.tables[table_id] + self.db.tables.remove(table) class RenameTableAction(Action): @@ -376,17 +374,51 @@ def run(self, exe: Executor) -> None: if not self.db.tables: return table = self.rng.choice(self.db.tables) - old_name = str(table) - table.rename += 1 + old_name = str(table) + table.rename += 1 + try: + exe.execute(f"ALTER TABLE {old_name} RENAME TO {identifier(table.name())}") + except: + table.rename -= 1 + raise + + +class RenameViewAction(Action): + def run(self, exe: Executor) -> None: + if self.db.scenario != Scenario.Rename: + return + with self.db.lock: + if not self.db.views: + return + view = self.rng.choice(self.db.views) + old_name = str(view) + view.rename += 1 try: exe.execute( - f"ALTER TABLE {old_name} RENAME TO 
{identifier(table.name())}" + f"ALTER {'MATERIALIZED VIEW' if view.materialized else 'VIEW'} {old_name} RENAME TO {identifier(view.name())}" ) except: - table.rename -= 1 + view.rename -= 1 raise +class RenameSinkAction(Action): + def run(self, exe: Executor) -> None: + if self.db.scenario != Scenario.Rename: + return + with self.db.lock: + if not self.db.kafka_sinks: + return + sink = self.rng.choice(self.db.kafka_sinks) + old_name = str(sink) + sink.rename += 1 + try: + exe.execute(f"ALTER SINK {old_name} RENAME TO {identifier(sink.name())}") + except: + sink.rename -= 1 + raise + + class CreateSchemaAction(Action): def run(self, exe: Executor) -> None: with self.db.lock: @@ -394,9 +426,9 @@ def run(self, exe: Executor) -> None: return schema_id = self.db.schema_id self.db.schema_id += 1 - schema = Schema(self.rng, schema_id) - schema.create(exe) - self.db.schemas.append(schema) + schema = Schema(self.rng, schema_id) + schema.create(exe) + self.db.schemas.append(schema) class DropSchemaAction(Action): @@ -422,13 +454,13 @@ def run(self, exe: Executor) -> None: return with self.db.lock: schema = self.rng.choice(self.db.schemas) - old_name = str(schema) - schema.rename += 1 - try: - exe.execute(f"ALTER SCHEMA {old_name} RENAME TO {schema}") - except: - schema.rename -= 1 - raise + old_name = str(schema) + schema.rename += 1 + try: + exe.execute(f"ALTER SCHEMA {old_name} RENAME TO {schema}") + except: + schema.rename -= 1 + raise class SwapSchemaAction(Action): @@ -476,21 +508,21 @@ def run(self, exe: Executor) -> None: return view_id = self.db.view_id self.db.view_id += 1 - # Don't use views for now since LIMIT 1 and statement_timeout are - # not effective yet at preventing long-running queries and OoMs. - base_object = self.rng.choice(self.db.db_objects()) - base_object2: DBObject | None = self.rng.choice(self.db.db_objects()) - if self.rng.choice([True, False]) or base_object2 == base_object: - base_object2 = None - view = View( - self.rng, - view_id, - base_object, - base_object2, - self.rng.choice(self.db.schemas), - ) - view.create(exe) - self.db.views.append(view) + # Don't use views for now since LIMIT 1 and statement_timeout are + # not effective yet at preventing long-running queries and OoMs. 
+ base_object = self.rng.choice(self.db.db_objects()) + base_object2: DBObject | None = self.rng.choice(self.db.db_objects()) + if self.rng.choice([True, False]) or base_object2 == base_object: + base_object2 = None + view = View( + self.rng, + view_id, + base_object, + base_object2, + self.rng.choice(self.db.schemas), + ) + view.create(exe) + self.db.views.append(view) class DropViewAction(Action): @@ -520,9 +552,9 @@ def run(self, exe: Executor) -> None: return role_id = self.db.role_id self.db.role_id += 1 - role = Role(role_id) - role.create(exe) - self.db.roles.append(role) + role = Role(role_id) + role.create(exe) + self.db.roles.append(role) class DropRoleAction(Action): @@ -550,15 +582,15 @@ def run(self, exe: Executor) -> None: return cluster_id = self.db.cluster_id self.db.cluster_id += 1 - cluster = Cluster( - cluster_id, - managed=self.rng.choice([True, False]), - size=self.rng.choice(["1", "2", "4"]), - replication_factor=self.rng.choice([1, 2, 4, 5]), - introspection_interval=self.rng.choice(["0", "1s", "10s"]), - ) - cluster.create(exe) - self.db.clusters.append(cluster) + cluster = Cluster( + cluster_id, + managed=self.rng.choice([True, False]), + size=self.rng.choice(["1", "2", "4"]), + replication_factor=self.rng.choice([1, 2, 4, 5]), + introspection_interval=self.rng.choice(["0", "1s", "10s"]), + ) + cluster.create(exe) + self.db.clusters.append(cluster) class DropClusterAction(Action): @@ -591,16 +623,20 @@ def run(self, exe: Executor) -> None: if not self.db.clusters: return cluster = self.rng.choice(self.db.clusters) - query = f"SET CLUSTER = {cluster}" - exe.execute(query) + query = f"SET CLUSTER = {cluster}" + exe.execute(query) class CreateClusterReplicaAction(Action): def errors_to_ignore(self) -> list[str]: - return [ - "cannot create more than one replica of a cluster containing sources or sinks" + result = [ + "cannot create more than one replica of a cluster containing sources or sinks", + # Can happen with reduced locking + "cannot create multiple replicas named", ] + super().errors_to_ignore() + return result + def run(self, exe: Executor) -> None: with self.db.lock: # Keep cluster 0 with 1 replica for sources/sinks @@ -615,9 +651,13 @@ def run(self, exe: Executor) -> None: size=self.rng.choice(["1", "2", "4"]), cluster=cluster, ) + cluster.replica_id += 1 + try: replica.create(exe) cluster.replicas.append(replica) - cluster.replica_id += 1 + except: + cluster.replica_id -= 1 + raise class DropClusterReplicaAction(Action): @@ -631,11 +671,10 @@ def run(self, exe: Executor) -> None: # Avoid "has no replicas available to service request" error if len(cluster.replicas) <= 1: return - replica_id = self.rng.randrange(len(cluster.replicas)) - replica = cluster.replicas[replica_id] + replica = self.rng.choice(cluster.replicas) query = f"DROP CLUSTER REPLICA {cluster}.{replica}" exe.execute(query) - del cluster.replicas[replica_id] + cluster.replicas.remove(replica) class GrantPrivilegesAction(Action): @@ -644,11 +683,11 @@ def run(self, exe: Executor) -> None: if not self.db.roles: return role = self.rng.choice(self.db.roles) - privilege = self.rng.choice(["SELECT", "INSERT", "UPDATE", "ALL"]) - tables_views: list[DBObject] = [*self.db.tables, *self.db.views] - table = self.rng.choice(tables_views) - query = f"GRANT {privilege} ON {table} TO {role}" - exe.execute(query) + privilege = self.rng.choice(["SELECT", "INSERT", "UPDATE", "ALL"]) + tables_views: list[DBObject] = [*self.db.tables, *self.db.views] + table = self.rng.choice(tables_views) + query = f"GRANT 
{privilege} ON {table} TO {role}" + exe.execute(query) class RevokePrivilegesAction(Action): @@ -657,11 +696,11 @@ def run(self, exe: Executor) -> None: if not self.db.roles: return role = self.rng.choice(self.db.roles) - privilege = self.rng.choice(["SELECT", "INSERT", "UPDATE", "ALL"]) - tables_views: list[DBObject] = [*self.db.tables, *self.db.views] - table = self.rng.choice(tables_views) - query = f"REVOKE {privilege} ON {table} FROM {role}" - exe.execute(query) + privilege = self.rng.choice(["SELECT", "INSERT", "UPDATE", "ALL"]) + tables_views: list[DBObject] = [*self.db.tables, *self.db.views] + table = self.rng.choice(tables_views) + query = f"REVOKE {privilege} ON {table} FROM {role}" + exe.execute(query) # TODO: Should factor this out so can easily use it without action @@ -774,7 +813,7 @@ def run(self, exe: Executor) -> None: # Otherwise getting failure on "up" locally time.sleep(1) self.composition.up("materialized", detach=True) - time.sleep(self.rng.uniform(20, 60)) + time.sleep(self.rng.uniform(20, 180)) class CreateWebhookSourceAction(Action): @@ -784,12 +823,12 @@ def run(self, exe: Executor) -> None: return webhook_source_id = self.db.webhook_source_id self.db.webhook_source_id += 1 - potential_clusters = [c for c in self.db.clusters if len(c.replicas) == 1] - cluster = self.rng.choice(potential_clusters) - schema = self.rng.choice(self.db.schemas) - source = WebhookSource(webhook_source_id, cluster, schema, self.rng) - source.create(exe) - self.db.webhook_sources.append(source) + potential_clusters = [c for c in self.db.clusters if len(c.replicas) == 1] + cluster = self.rng.choice(potential_clusters) + schema = self.rng.choice(self.db.schemas) + source = WebhookSource(webhook_source_id, cluster, schema, self.rng) + source.create(exe) + self.db.webhook_sources.append(source) class DropWebhookSourceAction(Action): @@ -816,14 +855,18 @@ def run(self, exe: Executor) -> None: return source_id = self.db.kafka_source_id self.db.kafka_source_id += 1 - potential_clusters = [c for c in self.db.clusters if len(c.replicas) == 1] - cluster = self.rng.choice(potential_clusters) - schema = self.rng.choice(self.db.schemas) + potential_clusters = [c for c in self.db.clusters if len(c.replicas) == 1] + cluster = self.rng.choice(potential_clusters) + schema = self.rng.choice(self.db.schemas) + try: source = KafkaSource( self.db.name(), source_id, cluster, schema, self.db.ports, self.rng ) source.create(exe) self.db.kafka_sources.append(source) + except: + if self.db.scenario != Scenario.Kill: + raise class DropKafkaSourceAction(Action): @@ -853,11 +896,15 @@ def run(self, exe: Executor) -> None: potential_clusters = [c for c in self.db.clusters if len(c.replicas) == 1] schema = self.rng.choice(self.db.schemas) cluster = self.rng.choice(potential_clusters) + try: source = PostgresSource( self.db.name(), source_id, cluster, schema, self.db.ports, self.rng ) source.create(exe) self.db.postgres_sources.append(source) + except: + if self.db.scenario != Scenario.Kill: + raise class DropPostgresSourceAction(Action): @@ -878,6 +925,12 @@ def run(self, exe: Executor) -> None: class CreateKafkaSinkAction(Action): + def errors_to_ignore(self) -> list[str]: + return [ + # Another replica can be created in parallel + "cannot create sink in cluster with more than one replica", + ] + super().errors_to_ignore() + def run(self, exe: Executor) -> None: with self.db.lock: if len(self.db.kafka_sinks) > MAX_KAFKA_SINKS: @@ -887,15 +940,15 @@ def run(self, exe: Executor) -> None: potential_clusters = [c for c 
in self.db.clusters if len(c.replicas) == 1] cluster = self.rng.choice(potential_clusters) schema = self.rng.choice(self.db.schemas) - sink = KafkaSink( - sink_id, - cluster, - schema, - self.rng.choice(self.db.db_objects_without_views()), - self.rng, - ) - sink.create(exe) - self.db.kafka_sinks.append(sink) + sink = KafkaSink( + sink_id, + cluster, + schema, + self.rng.choice(self.db.db_objects_without_views()), + self.rng, + ) + sink.create(exe) + self.db.kafka_sinks.append(sink) class DropKafkaSinkAction(Action): @@ -922,26 +975,31 @@ def run(self, exe: Executor) -> None: return source = self.rng.choice(self.db.webhook_sources) - url = f"http://{self.db.host}:{self.db.ports['http']}/api/webhook/{self.db}/public/{source}" + url = f"http://{self.db.host}:{self.db.ports['http']}/api/webhook/{self.db}/public/{source}" - payload = source.body_format.to_data_type().random_value(self.rng) + payload = source.body_format.to_data_type().random_value(self.rng) - header_fields = source.explicit_include_headers - if source.include_headers: - header_fields.extend( - ["timestamp", "x-event-type", "signature", "x-mz-api-key"] - ) + header_fields = source.explicit_include_headers + if source.include_headers: + header_fields.extend( + ["timestamp", "x-event-type", "signature", "x-mz-api-key"] + ) - headers = { - header: f'"{Text.random_value(self.rng)}"'.encode() - for header in self.rng.sample(header_fields, len(header_fields)) - } + headers = { + header: f'"{Text.random_value(self.rng)}"'.encode() + for header in self.rng.sample(header_fields, len(header_fields)) + } - headers_strs = [f"{key}: {value}" for key, value in enumerate(headers)] - exe.log( - f"POST Headers: {', '.join(headers_strs)} Body: {payload.encode('utf-8')}" - ) + headers_strs = [f"{key}: {value}" for key, value in enumerate(headers)] + exe.log( + f"POST Headers: {', '.join(headers_strs)} Body: {payload.encode('utf-8')}" + ) + try: requests.post(url, data=payload.encode("utf-8"), headers=headers) + except requests.exceptions.ConnectionError: + # Expeceted when Mz is killed + if self.db.scenario != Scenario.Kill: + raise class ActionList: @@ -1012,8 +1070,8 @@ def __init__( (DropRoleAction, 1), (CreateClusterAction, 2), (DropClusterAction, 1), - (CreateClusterReplicaAction, 8), - (DropClusterReplicaAction, 4), + (CreateClusterReplicaAction, 4), + (DropClusterReplicaAction, 2), (SetClusterAction, 1), (CreateWebhookSourceAction, 2), (DropWebhookSourceAction, 1), @@ -1030,6 +1088,8 @@ def __init__( (DropSchemaAction, 1), (RenameSchemaAction, 10), (RenameTableAction, 10), + (RenameViewAction, 10), + (RenameSinkAction, 10), (SwapSchemaAction, 10), # (TransactionIsolationAction, 1), ], diff --git a/misc/python/materialize/parallel_workload/database.py b/misc/python/materialize/parallel_workload/database.py index 35aa59f3f33dd..e3e5fb8853548 100644 --- a/misc/python/materialize/parallel_workload/database.py +++ b/misc/python/materialize/parallel_workload/database.py @@ -210,6 +210,7 @@ class View(DBObject): join_column: Column | None join_column2: Column | None assert_not_null: list[Column] + rename: int schema: Schema def __init__( @@ -220,6 +221,7 @@ def __init__( base_object2: DBObject | None, schema: Schema, ): + self.rename = 0 self.view_id = view_id self.base_object = base_object self.base_object2 = base_object2 @@ -262,6 +264,8 @@ def __init__( self.join_column2 = rng.choice(columns) def name(self) -> str: + if self.rename: + return naughtify(f"v-{self.view_id}-{self.rename}") return naughtify(f"v-{self.view_id}") def __str__(self) -> 
str: @@ -578,13 +582,17 @@ class ClusterReplica: replica_id: int size: str cluster: "Cluster" + rename: int def __init__(self, replica_id: int, size: str, cluster: "Cluster"): self.replica_id = replica_id self.size = size self.cluster = cluster + self.rename = 0 def name(self) -> str: + if self.rename: + return naughtify(f"r-{self.replica_id+1}-{self.rename}") return naughtify(f"r-{self.replica_id+1}") def __str__(self) -> str: @@ -604,6 +612,7 @@ class Cluster: replicas: list[ClusterReplica] replica_id: int introspection_interval: str + rename: int def __init__( self, @@ -621,8 +630,11 @@ def __init__( ] self.replica_id = len(self.replicas) self.introspection_interval = introspection_interval + self.rename = 0 def name(self) -> str: + if self.rename: + return naughtify(f"cluster-{self.cluster_id}-{self.rename}") return naughtify(f"cluster-{self.cluster_id}") def __str__(self) -> str: @@ -798,7 +810,39 @@ def drop(self, exe: Executor) -> None: def create(self, exe: Executor) -> None: self.drop(exe) + exe.execute("ALTER SYSTEM SET enable_webhook_sources TO true") + exe.execute("ALTER SYSTEM SET max_schemas_per_database = 105") + # The presence of ALTER TABLE RENAME can cause the total number of tables to exceed MAX_TABLES + exe.execute("ALTER SYSTEM SET max_tables = 200") + exe.execute("ALTER SYSTEM SET max_materialized_views = 105") + exe.execute("ALTER SYSTEM SET max_sources = 105") + exe.execute("ALTER SYSTEM SET max_roles = 105") + exe.execute("ALTER SYSTEM SET max_clusters = 105") + exe.execute("ALTER SYSTEM SET max_replicas_per_cluster = 105") + # Most queries should not fail because of privileges + exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON TABLES TO PUBLIC" + ) + exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON TYPES TO PUBLIC" + ) + exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON SECRETS TO PUBLIC" + ) + exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON CONNECTIONS TO PUBLIC" + ) + exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON DATABASES TO PUBLIC" + ) + exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON SCHEMAS TO PUBLIC" + ) + exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON CLUSTERS TO PUBLIC" + ) exe.execute(f"CREATE DATABASE {self}") + exe.execute(f"ALTER DATABASE {self} OWNER TO materialize") def create_relations(self, exe: Executor) -> None: exe.execute("SELECT name FROM mz_clusters WHERE name LIKE 'c%'") diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index b37033df48088..0895e107aaa23 100644 --- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -58,22 +58,6 @@ def run( ) initialize_logging() - system_conn = pg8000.connect( - host=host, port=ports["mz_system"], user="mz_system", database="materialize" - ) - system_conn.autocommit = True - with system_conn.cursor() as cur: - cur.execute("ALTER SYSTEM SET enable_webhook_sources TO true") - cur.execute("ALTER SYSTEM SET max_schemas_per_database = 105") - # The presence of ALTER TABLE RENAME can cause the total number of tables to exceed MAX_TABLES - cur.execute("ALTER SYSTEM SET max_tables = 200") - cur.execute("ALTER SYSTEM SET max_materialized_views = 105") - cur.execute("ALTER SYSTEM SET max_sources = 105") - cur.execute("ALTER SYSTEM 
SET max_roles = 105") - cur.execute("ALTER SYSTEM SET max_clusters = 105") - cur.execute("ALTER SYSTEM SET max_replicas_per_cluster = 105") - system_conn.close() - end_time = ( datetime.datetime.now() + datetime.timedelta(seconds=runtime) ).timestamp() @@ -82,11 +66,14 @@ def run( database = Database( rng, seed, host, ports, complexity, scenario, naughty_identifiers ) - conn = pg8000.connect(host=host, port=ports["materialized"], user="materialize") - conn.autocommit = True - with conn.cursor() as cur: + + system_conn = pg8000.connect( + host=host, port=ports["mz_system"], user="mz_system", database="materialize" + ) + system_conn.autocommit = True + with system_conn.cursor() as cur: database.create(Executor(rng, cur)) - conn.close() + system_conn.close() conn = pg8000.connect( host=host, diff --git a/misc/python/materialize/parallel_workload/settings.py b/misc/python/materialize/parallel_workload/settings.py index 67411f6baabea..7e61888ad499f 100644 --- a/misc/python/materialize/parallel_workload/settings.py +++ b/misc/python/materialize/parallel_workload/settings.py @@ -21,3 +21,4 @@ class Scenario(Enum): Cancel = "cancel" Kill = "kill" Rename = "rename" + BackupRestore = "backup-restore" From 73be019e76279f5018600b293137004658356cfe Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Mon, 23 Oct 2023 07:44:23 +0000 Subject: [PATCH 02/17] parallel-workload: Handle drops and other errors in kill scenario --- .../materialize/parallel_workload/action.py | 88 +++++++++++++++---- 1 file changed, 71 insertions(+), 17 deletions(-) diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 7b3a045c17b0e..3fe6e57386fa4 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -49,11 +49,6 @@ if TYPE_CHECKING: from materialize.parallel_workload.worker import Worker -# TODO: In kill scenario drops can be successful, but we might never know, see -# https://github.com/MaterializeInc/materialize/issues/20465 We should handle -# this by rescanning objects we expect to be there and removing the ones that -# were dropped. This also has the risk that objects get lost as a bug though. 
- # TODO: CASCADE in DROPs, keep track of what will be deleted class Action: rng: random.Random @@ -335,7 +330,12 @@ def run(self, exe: Executor) -> None: return index_name = self.rng.choice(list(self.db.indexes)) query = f"DROP INDEX {identifier(index_name)}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + raise e self.db.indexes.remove(index_name) @@ -362,7 +362,12 @@ def run(self, exe: Executor) -> None: return table = self.rng.choice(self.db.tables) query = f"DROP TABLE {table}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + raise e self.db.tables.remove(table) @@ -444,11 +449,21 @@ def run(self, exe: Executor) -> None: schema_id = self.rng.randrange(len(self.db.schemas)) schema = self.db.schemas[schema_id] query = f"DROP SCHEMA {schema}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown schema" not in e.msg: + raise e del self.db.schemas[schema_id] class RenameSchemaAction(Action): + def errors_to_ignore(self) -> list[str]: + return [ + "ambiguous reference to schema named" # see https://github.com/MaterializeInc/materialize/pull/22551#pullrequestreview-1691876923 + ] + super().errors_to_ignore() + def run(self, exe: Executor) -> None: if self.db.scenario != Scenario.Rename: return @@ -541,7 +556,12 @@ def run(self, exe: Executor) -> None: query = f"DROP MATERIALIZED VIEW {view}" else: query = f"DROP VIEW {view}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + raise e del self.db.views[view_id] @@ -571,7 +591,12 @@ def run(self, exe: Executor) -> None: role_id = self.rng.randrange(len(self.db.roles)) role = self.db.roles[role_id] query = f"DROP ROLE {role}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown role" not in e.msg: + raise e del self.db.roles[role_id] @@ -608,7 +633,12 @@ def run(self, exe: Executor) -> None: cluster_id = self.rng.randrange(1, len(self.db.clusters)) cluster = self.db.clusters[cluster_id] query = f"DROP CLUSTER {cluster}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown cluster" not in e.msg: + raise e del self.db.clusters[cluster_id] @@ -673,10 +703,14 @@ def run(self, exe: Executor) -> None: return replica = self.rng.choice(cluster.replicas) query = f"DROP CLUSTER REPLICA {cluster}.{replica}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "has no CLUSTER REPLICA named" not in e.msg: + raise e cluster.replicas.remove(replica) - class GrantPrivilegesAction(Action): def run(self, exe: Executor) -> None: with self.db.lock: @@ -844,7 +878,12 @@ def run(self, exe: Executor) -> None: source_id = self.rng.randrange(len(self.db.webhook_sources)) source = self.db.webhook_sources[source_id] query = f"DROP SOURCE {source}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown catalog item" 
not in e.msg: + raise e del self.db.webhook_sources[source_id] @@ -882,7 +921,12 @@ def run(self, exe: Executor) -> None: source_id = self.rng.randrange(len(self.db.kafka_sources)) source = self.db.kafka_sources[source_id] query = f"DROP SOURCE {source}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + raise e del self.db.kafka_sources[source_id] @@ -920,7 +964,12 @@ def run(self, exe: Executor) -> None: source_id = self.rng.randrange(len(self.db.postgres_sources)) source = self.db.postgres_sources[source_id] query = f"DROP SOURCE {source.executor.source}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + raise e del self.db.postgres_sources[source_id] @@ -964,7 +1013,12 @@ def run(self, exe: Executor) -> None: sink_id = self.rng.randrange(len(self.db.kafka_sinks)) sink = self.db.kafka_sinks[sink_id] query = f"DROP SINK {sink}" - exe.execute(query) + try: + exe.execute(query) + except QueryError as e: + # expected, see #20465 + if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + raise e del self.db.kafka_sinks[sink_id] From c7cc60a9eb0bf727967ccdc46df697bc260ca08f Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Mon, 23 Oct 2023 11:34:19 +0000 Subject: [PATCH 03/17] parallel-workload: Run with multiple databases --- .../materialize/parallel_workload/action.py | 555 ++++++++++-------- .../materialize/parallel_workload/database.py | 67 +-- .../materialize/parallel_workload/executor.py | 15 +- .../parallel_workload/parallel_workload.py | 125 +++- .../materialize/parallel_workload/worker.py | 60 +- test/parallel-workload/mzcompose.py | 1 + 6 files changed, 460 insertions(+), 363 deletions(-) diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 3fe6e57386fa4..e49161133eddc 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -19,6 +19,7 @@ from materialize.data_ingest.data_type import NUMBER_TYPES, Text, TextTextMap from materialize.mzcompose.composition import Composition from materialize.parallel_workload.database import ( + DB_OFFSET, MAX_CLUSTER_REPLICAS, MAX_CLUSTERS, MAX_KAFKA_SINKS, @@ -32,7 +33,6 @@ MAX_WEBHOOK_SOURCES, Cluster, ClusterReplica, - Database, DBObject, KafkaSink, KafkaSource, @@ -52,21 +52,19 @@ # TODO: CASCADE in DROPs, keep track of what will be deleted class Action: rng: random.Random - db: Database - def __init__(self, rng: random.Random, db: Database): + def __init__(self, rng: random.Random): self.rng = rng - self.db = db def run(self, exe: Executor) -> None: raise NotImplementedError - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: result = [ "permission denied for", "must be owner of", ] - if self.db.complexity == Complexity.DDL: + if exe.db.complexity == Complexity.DDL: result.extend( [ "query could not complete", @@ -79,13 +77,13 @@ def errors_to_ignore(self) -> list[str]: "the transaction's active cluster has been dropped", # cluster was dropped ] ) - if self.db.scenario == Scenario.Cancel: + if exe.db.scenario == Scenario.Cancel: result.extend( [ "canceling statement due to user request", ] ) - if self.db.scenario == Scenario.Kill: + if exe.db.scenario == Scenario.Kill: 
result.extend( [ "network error", @@ -101,9 +99,9 @@ def errors_to_ignore(self) -> list[str]: class FetchAction(Action): - def errors_to_ignore(self) -> list[str]: - result = super().errors_to_ignore() - if self.db.complexity == Complexity.DDL: + def errors_to_ignore(self, exe: Executor) -> list[str]: + result = super().errors_to_ignore(exe) + if exe.db.complexity == Complexity.DDL: result.extend( [ "does not exist", @@ -112,7 +110,7 @@ def errors_to_ignore(self) -> list[str]: return result def run(self, exe: Executor) -> None: - obj = self.rng.choice(self.db.db_objects()) + obj = self.rng.choice(exe.db.db_objects()) # See https://github.com/MaterializeInc/materialize/issues/20474 exe.rollback() if self.rng.choice([True, False]) else exe.commit() query = f"DECLARE c CURSOR FOR SUBSCRIBE {obj}" @@ -129,15 +127,15 @@ def run(self, exe: Executor) -> None: class SelectAction(Action): - def errors_to_ignore(self) -> list[str]: - result = super().errors_to_ignore() - if self.db.complexity in (Complexity.DML, Complexity.DDL): + def errors_to_ignore(self, exe: Executor) -> list[str]: + result = super().errors_to_ignore(exe) + if exe.db.complexity in (Complexity.DML, Complexity.DDL): result.extend( [ "in the same timedomain", ] ) - if self.db.complexity == Complexity.DDL: + if exe.db.complexity == Complexity.DDL: result.extend( [ "does not exist", @@ -146,9 +144,9 @@ def errors_to_ignore(self) -> list[str]: return result def run(self, exe: Executor) -> None: - obj = self.rng.choice(self.db.db_objects()) + obj = self.rng.choice(exe.db.db_objects()) column = self.rng.choice(obj.columns) - obj2 = self.rng.choice(self.db.db_objects()) + obj2 = self.rng.choice(exe.db.db_objects()) obj_name = str(obj) obj2_name = str(obj2) columns = [c for c in obj2.columns if c.data_type == column.data_type] @@ -197,14 +195,14 @@ class InsertAction(Action): def run(self, exe: Executor) -> None: table = None if exe.insert_table != None: - for t in self.db.tables: + for t in exe.db.tables: if t.table_id == exe.insert_table: table = t break else: exe.commit() if self.rng.choice([True, False]) else exe.rollback() if not table: - table = self.rng.choice(self.db.tables) + table = self.rng.choice(exe.db.tables) column_names = ", ".join(column.name(True) for column in table.columns) column_values = ", ".join( @@ -215,14 +213,14 @@ def run(self, exe: Executor) -> None: return exe.execute(query) exe.insert_table = table.table_id - with self.db.lock: + with exe.db.lock: table.num_rows += 1 class SourceInsertAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - sources = self.db.kafka_sources + self.db.postgres_sources + with exe.db.lock: + sources = exe.db.kafka_sources + exe.db.postgres_sources if not sources: return source = self.rng.choice(sources) @@ -232,20 +230,20 @@ def run(self, exe: Executor) -> None: class UpdateAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "canceling statement due to statement timeout", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: table = None if exe.insert_table != None: - for t in self.db.tables: + for t in exe.db.tables: if t.table_id == exe.insert_table: table = t break if not table: - table = self.rng.choice(self.db.tables) + table = self.rng.choice(exe.db.tables) column1 = table.columns[0] column2 = self.rng.choice(table.columns) @@ -259,13 +257,13 @@ def run(self, exe: Executor) -> None: class DeleteAction(Action): - def 
errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "canceling statement due to statement timeout", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - table = self.rng.choice(self.db.tables) + table = self.rng.choice(exe.db.tables) query = f"DELETE FROM {table}" if self.rng.random() < 0.95: column = self.rng.choice(table.columns) @@ -282,13 +280,13 @@ def run(self, exe: Executor) -> None: # so for now have to trigger them manually here. if self.rng.choice([True, False]): exe.commit() - with self.db.lock: + with exe.db.lock: table.num_rows = 0 class CommentAction(Action): def run(self, exe: Executor) -> None: - table = self.rng.choice(self.db.tables) + table = self.rng.choice(exe.db.tables) if self.rng.choice([True, False]): column = self.rng.choice(table.columns) @@ -300,13 +298,13 @@ def run(self, exe: Executor) -> None: class CreateIndexAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "already exists", # TODO: Investigate - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - tables_views: list[DBObject] = [*self.db.tables, *self.db.views] + tables_views: list[DBObject] = [*exe.db.tables, *exe.db.views] table = self.rng.choice(tables_views) columns = self.rng.sample(table.columns, len(table.columns)) columns_str = "_".join(column.name() for column in columns) @@ -319,83 +317,91 @@ def run(self, exe: Executor) -> None: index_str = ", ".join(index_elems) query = f"CREATE INDEX {identifier(index_name)} ON {table} ({index_str})" exe.execute(query) - with self.db.lock: - self.db.indexes.add(index_name) + with exe.db.lock: + exe.db.indexes.add(index_name) class DropIndexAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.indexes: + with exe.db.lock: + if not exe.db.indexes: return - index_name = self.rng.choice(list(self.db.indexes)) + index_name = self.rng.choice(list(exe.db.indexes)) query = f"DROP INDEX {identifier(index_name)}" try: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + if ( + exe.db.scenario != Scenario.Kill + or "unknown catalog item" not in e.msg + ): raise e - self.db.indexes.remove(index_name) + exe.db.indexes.remove(index_name) class CreateTableAction(Action): def run(self, exe: Executor) -> None: - if len(self.db.tables) > MAX_TABLES: + if len(exe.db.tables) > MAX_TABLES: return - table_id = self.db.table_id - self.db.table_id += 1 - table = Table(self.rng, table_id, self.rng.choice(self.db.schemas)) + table_id = exe.db.table_id + exe.db.table_id += 1 + table = Table(self.rng, table_id, self.rng.choice(exe.db.schemas)) table.create(exe) - self.db.tables.append(table) + exe.db.tables.append(table) class DropTableAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "still depended upon by", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.tables) <= 2: + with exe.db.lock: + if len(exe.db.tables) <= 2: return - table = self.rng.choice(self.db.tables) + table = self.rng.choice(exe.db.tables) query = f"DROP TABLE {table}" try: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or 
"unknown catalog item" not in e.msg: + if ( + exe.db.scenario != Scenario.Kill + or "unknown catalog item" not in e.msg + ): raise e - self.db.tables.remove(table) + exe.db.tables.remove(table) class RenameTableAction(Action): def run(self, exe: Executor) -> None: - if self.db.scenario != Scenario.Rename: + if exe.db.scenario != Scenario.Rename: return - with self.db.lock: - if not self.db.tables: + with exe.db.lock: + if not exe.db.tables: return - table = self.rng.choice(self.db.tables) - old_name = str(table) - table.rename += 1 - try: - exe.execute(f"ALTER TABLE {old_name} RENAME TO {identifier(table.name())}") - except: - table.rename -= 1 - raise + table = self.rng.choice(exe.db.tables) + old_name = str(table) + table.rename += 1 + try: + exe.execute( + f"ALTER TABLE {old_name} RENAME TO {identifier(table.name())}" + ) + except: + table.rename -= 1 + raise class RenameViewAction(Action): def run(self, exe: Executor) -> None: - if self.db.scenario != Scenario.Rename: + if exe.db.scenario != Scenario.Rename: return - with self.db.lock: - if not self.db.views: + with exe.db.lock: + if not exe.db.views: return - view = self.rng.choice(self.db.views) + view = self.rng.choice(exe.db.views) old_name = str(view) view.rename += 1 try: @@ -409,73 +415,75 @@ def run(self, exe: Executor) -> None: class RenameSinkAction(Action): def run(self, exe: Executor) -> None: - if self.db.scenario != Scenario.Rename: + if exe.db.scenario != Scenario.Rename: return - with self.db.lock: - if not self.db.kafka_sinks: + with exe.db.lock: + if not exe.db.kafka_sinks: return - sink = self.rng.choice(self.db.kafka_sinks) - old_name = str(sink) - sink.rename += 1 - try: - exe.execute(f"ALTER SINK {old_name} RENAME TO {identifier(sink.name())}") - except: - sink.rename -= 1 - raise + sink = self.rng.choice(exe.db.kafka_sinks) + old_name = str(sink) + sink.rename += 1 + try: + exe.execute( + f"ALTER SINK {old_name} RENAME TO {identifier(sink.name())}" + ) + except: + sink.rename -= 1 + raise class CreateSchemaAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.schemas) > MAX_SCHEMAS: + with exe.db.lock: + if len(exe.db.schemas) > MAX_SCHEMAS: return - schema_id = self.db.schema_id - self.db.schema_id += 1 + schema_id = exe.db.schema_id + exe.db.schema_id += 1 schema = Schema(self.rng, schema_id) schema.create(exe) - self.db.schemas.append(schema) + exe.db.schemas.append(schema) class DropSchemaAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "cannot be dropped without CASCADE while it contains objects", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.schemas) <= 1: + with exe.db.lock: + if len(exe.db.schemas) <= 1: return - schema_id = self.rng.randrange(len(self.db.schemas)) - schema = self.db.schemas[schema_id] + schema_id = self.rng.randrange(len(exe.db.schemas)) + schema = exe.db.schemas[schema_id] query = f"DROP SCHEMA {schema}" try: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "unknown schema" not in e.msg: + if exe.db.scenario != Scenario.Kill or "unknown schema" not in e.msg: raise e - del self.db.schemas[schema_id] + del exe.db.schemas[schema_id] class RenameSchemaAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "ambiguous reference to schema named" # see 
https://github.com/MaterializeInc/materialize/pull/22551#pullrequestreview-1691876923 - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - if self.db.scenario != Scenario.Rename: + if exe.db.scenario != Scenario.Rename: return - with self.db.lock: - schema = self.rng.choice(self.db.schemas) - old_name = str(schema) - schema.rename += 1 - try: - exe.execute(f"ALTER SCHEMA {old_name} RENAME TO {schema}") - except: - schema.rename -= 1 - raise + with exe.db.lock: + schema = self.rng.choice(exe.db.schemas) + old_name = str(schema) + schema.rename += 1 + try: + exe.execute(f"ALTER SCHEMA {old_name} RENAME TO {schema}") + except: + schema.rename -= 1 + raise class SwapSchemaAction(Action): @@ -518,15 +526,15 @@ def run(self, exe: Executor) -> None: class CreateViewAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.views) > MAX_VIEWS: + with exe.db.lock: + if len(exe.db.views) > MAX_VIEWS: return - view_id = self.db.view_id - self.db.view_id += 1 + view_id = exe.db.view_id + exe.db.view_id += 1 # Don't use views for now since LIMIT 1 and statement_timeout are # not effective yet at preventing long-running queries and OoMs. - base_object = self.rng.choice(self.db.db_objects()) - base_object2: DBObject | None = self.rng.choice(self.db.db_objects()) + base_object = self.rng.choice(exe.db.db_objects()) + base_object2: DBObject | None = self.rng.choice(exe.db.db_objects()) if self.rng.choice([True, False]) or base_object2 == base_object: base_object2 = None view = View( @@ -534,24 +542,24 @@ def run(self, exe: Executor) -> None: view_id, base_object, base_object2, - self.rng.choice(self.db.schemas), + self.rng.choice(exe.db.schemas), ) view.create(exe) - self.db.views.append(view) + exe.db.views.append(view) class DropViewAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "still depended upon by", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.views: + with exe.db.lock: + if not exe.db.views: return - view_id = self.rng.randrange(len(self.db.views)) - view = self.db.views[view_id] + view_id = self.rng.randrange(len(exe.db.views)) + view = exe.db.views[view_id] if view.materialized: query = f"DROP MATERIALIZED VIEW {view}" else: @@ -560,117 +568,119 @@ def run(self, exe: Executor) -> None: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + if ( + exe.db.scenario != Scenario.Kill + or "unknown catalog item" not in e.msg + ): raise e - del self.db.views[view_id] + del exe.db.views[view_id] class CreateRoleAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.roles) > MAX_ROLES: + with exe.db.lock: + if len(exe.db.roles) > MAX_ROLES: return - role_id = self.db.role_id - self.db.role_id += 1 - role = Role(role_id) + role_id = exe.db.role_id + exe.db.role_id += 1 + role = Role(exe.db.db_id * DB_OFFSET + role_id) role.create(exe) - self.db.roles.append(role) + exe.db.roles.append(role) class DropRoleAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "cannot be dropped because some objects depend on it", "current role cannot be dropped", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> 
None: - with self.db.lock: - if not self.db.roles: + with exe.db.lock: + if not exe.db.roles: return - role_id = self.rng.randrange(len(self.db.roles)) - role = self.db.roles[role_id] + role = self.rng.choice(exe.db.roles) query = f"DROP ROLE {role}" try: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "unknown role" not in e.msg: + if exe.db.scenario != Scenario.Kill or "unknown role" not in e.msg: raise e - del self.db.roles[role_id] + exe.db.roles.remove(role) class CreateClusterAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.clusters) > MAX_CLUSTERS: + with exe.db.lock: + if len(exe.db.clusters) > MAX_CLUSTERS: return - cluster_id = self.db.cluster_id - self.db.cluster_id += 1 + cluster_id = exe.db.cluster_id + exe.db.cluster_id += 1 cluster = Cluster( - cluster_id, + exe.db.db_id * DB_OFFSET + cluster_id, managed=self.rng.choice([True, False]), size=self.rng.choice(["1", "2", "4"]), replication_factor=self.rng.choice([1, 2, 4, 5]), introspection_interval=self.rng.choice(["0", "1s", "10s"]), ) cluster.create(exe) - self.db.clusters.append(cluster) + exe.db.clusters.append(cluster) class DropClusterAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ # cannot drop cluster "..." because other objects depend on it "because other objects depend on it", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.clusters) <= 1: + with exe.db.lock: + if len(exe.db.clusters) <= 1: return # Keep cluster 0 with 1 replica for sources/sinks - cluster_id = self.rng.randrange(1, len(self.db.clusters)) - cluster = self.db.clusters[cluster_id] + cluster_id = self.rng.randrange(1, len(exe.db.clusters)) + cluster = exe.db.clusters[cluster_id] query = f"DROP CLUSTER {cluster}" try: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "unknown cluster" not in e.msg: + if exe.db.scenario != Scenario.Kill or "unknown cluster" not in e.msg: raise e - del self.db.clusters[cluster_id] + del exe.db.clusters[cluster_id] class SetClusterAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "SET cluster cannot be called in an active transaction", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.clusters: + with exe.db.lock: + if not exe.db.clusters: return - cluster = self.rng.choice(self.db.clusters) + cluster = self.rng.choice(exe.db.clusters) query = f"SET CLUSTER = {cluster}" exe.execute(query) class CreateClusterReplicaAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: result = [ "cannot create more than one replica of a cluster containing sources or sinks", # Can happen with reduced locking "cannot create multiple replicas named", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) return result def run(self, exe: Executor) -> None: - with self.db.lock: + with exe.db.lock: # Keep cluster 0 with 1 replica for sources/sinks - unmanaged_clusters = [c for c in self.db.clusters[1:] if not c.managed] + unmanaged_clusters = [c for c in exe.db.clusters[1:] if not c.managed] if not unmanaged_clusters: return cluster = self.rng.choice(unmanaged_clusters) @@ 
-692,9 +702,9 @@ def run(self, exe: Executor) -> None: class DropClusterReplicaAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: + with exe.db.lock: # Keep cluster 0 with 1 replica for sources/sinks - unmanaged_clusters = [c for c in self.db.clusters[1:] if not c.managed] + unmanaged_clusters = [c for c in exe.db.clusters[1:] if not c.managed] if not unmanaged_clusters: return cluster = self.rng.choice(unmanaged_clusters) @@ -707,18 +717,22 @@ def run(self, exe: Executor) -> None: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "has no CLUSTER REPLICA named" not in e.msg: + if ( + exe.db.scenario != Scenario.Kill + or "has no CLUSTER REPLICA named" not in e.msg + ): raise e cluster.replicas.remove(replica) + class GrantPrivilegesAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.roles: + with exe.db.lock: + if not exe.db.roles: return - role = self.rng.choice(self.db.roles) + role = self.rng.choice(exe.db.roles) privilege = self.rng.choice(["SELECT", "INSERT", "UPDATE", "ALL"]) - tables_views: list[DBObject] = [*self.db.tables, *self.db.views] + tables_views: list[DBObject] = [*exe.db.tables, *exe.db.views] table = self.rng.choice(tables_views) query = f"GRANT {privilege} ON {table} TO {role}" exe.execute(query) @@ -726,12 +740,12 @@ def run(self, exe: Executor) -> None: class RevokePrivilegesAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.roles: + with exe.db.lock: + if not exe.db.roles: return - role = self.rng.choice(self.db.roles) + role = self.rng.choice(exe.db.roles) privilege = self.rng.choice(["SELECT", "INSERT", "UPDATE", "ALL"]) - tables_views: list[DBObject] = [*self.db.tables, *self.db.views] + tables_views: list[DBObject] = [*exe.db.tables, *exe.db.views] table = self.rng.choice(tables_views) query = f"REVOKE {privilege} ON {table} FROM {role}" exe.execute(query) @@ -742,20 +756,19 @@ class ReconnectAction(Action): def __init__( self, rng: random.Random, - db: Database, random_role: bool = True, ): - super().__init__(rng, db) + super().__init__(rng) self.random_role = random_role def run(self, exe: Executor) -> None: autocommit = exe.cur._c.autocommit - host = self.db.host - port = self.db.ports["materialized"] - with self.db.lock: - if self.random_role and self.db.roles: + host = exe.db.host + port = exe.db.ports["materialized"] + with exe.db.lock: + if self.random_role and exe.db.roles: user = self.rng.choice( - ["materialize", str(self.rng.choice(self.db.roles))] + ["materialize", str(self.rng.choice(exe.db.roles))] ) else: user = "materialize" @@ -774,7 +787,7 @@ def run(self, exe: Executor) -> None: for i in range(NUM_ATTEMPTS): try: conn = pg8000.connect( - host=host, port=port, user=user, database=self.db.name() + host=host, port=port, user=user, database=exe.db.name() ) conn.autocommit = autocommit cur = conn.cursor() @@ -798,30 +811,29 @@ def run(self, exe: Executor) -> None: class CancelAction(Action): workers: list["Worker"] - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "must be a member of", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def __init__( self, rng: random.Random, - db: Database, workers: list["Worker"], ): - super().__init__(rng, db) + super().__init__(rng) self.workers = workers def run(self, exe: Executor) -> None: pid = self.rng.choice( - [worker.exe.pg_pid for worker in self.workers if worker.exe 
and worker.exe.pg_pid != -1] # type: ignore + [exe.pg_pid for worker in self.workers for exe in worker.exes if exe and exe.pg_pid != -1] # type: ignore ) worker = None for i in range(len(self.workers)): - worker_exe = self.workers[i].exe - if worker_exe and worker_exe.pg_pid == pid: - worker = f"worker_{i}" - break + for worker_exe in self.workers[i].exes: + if worker_exe and worker_exe.pg_pid == pid: + worker = f"worker_{i}" + break assert worker exe.execute( f"SELECT pg_cancel_backend({pid})", extra_info=f"Canceling {worker}" @@ -836,10 +848,9 @@ class KillAction(Action): def __init__( self, rng: random.Random, - db: Database, composition: Composition, ): - super().__init__(rng, db) + super().__init__(rng) self.composition = composition def run(self, exe: Executor) -> None: @@ -852,184 +863,206 @@ def run(self, exe: Executor) -> None: class CreateWebhookSourceAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.webhook_sources) > MAX_WEBHOOK_SOURCES: + with exe.db.lock: + if len(exe.db.webhook_sources) > MAX_WEBHOOK_SOURCES: return - webhook_source_id = self.db.webhook_source_id - self.db.webhook_source_id += 1 - potential_clusters = [c for c in self.db.clusters if len(c.replicas) == 1] + webhook_source_id = exe.db.webhook_source_id + exe.db.webhook_source_id += 1 + potential_clusters = [c for c in exe.db.clusters if len(c.replicas) == 1] cluster = self.rng.choice(potential_clusters) - schema = self.rng.choice(self.db.schemas) + schema = self.rng.choice(exe.db.schemas) source = WebhookSource(webhook_source_id, cluster, schema, self.rng) source.create(exe) - self.db.webhook_sources.append(source) + exe.db.webhook_sources.append(source) class DropWebhookSourceAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "still depended upon by", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.webhook_sources: + with exe.db.lock: + if not exe.db.webhook_sources: return - source_id = self.rng.randrange(len(self.db.webhook_sources)) - source = self.db.webhook_sources[source_id] + source_id = self.rng.randrange(len(exe.db.webhook_sources)) + source = exe.db.webhook_sources[source_id] query = f"DROP SOURCE {source}" try: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + if ( + exe.db.scenario != Scenario.Kill + or "unknown catalog item" not in e.msg + ): raise e - del self.db.webhook_sources[source_id] + del exe.db.webhook_sources[source_id] class CreateKafkaSourceAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.kafka_sources) > MAX_KAFKA_SOURCES: + with exe.db.lock: + if len(exe.db.kafka_sources) > MAX_KAFKA_SOURCES: return - source_id = self.db.kafka_source_id - self.db.kafka_source_id += 1 - potential_clusters = [c for c in self.db.clusters if len(c.replicas) == 1] + source_id = exe.db.kafka_source_id + exe.db.kafka_source_id += 1 + potential_clusters = [c for c in exe.db.clusters if len(c.replicas) == 1] cluster = self.rng.choice(potential_clusters) - schema = self.rng.choice(self.db.schemas) + schema = self.rng.choice(exe.db.schemas) try: source = KafkaSource( - self.db.name(), source_id, cluster, schema, self.db.ports, self.rng + exe.db.name(), + exe.db.db_id * DB_OFFSET + source_id, + cluster, + schema, + exe.db.ports, + self.rng, ) source.create(exe) - 
self.db.kafka_sources.append(source) + exe.db.kafka_sources.append(source) except: - if self.db.scenario != Scenario.Kill: + if exe.db.scenario != Scenario.Kill: raise class DropKafkaSourceAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "still depended upon by", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.kafka_sources: + with exe.db.lock: + if not exe.db.kafka_sources: return - source_id = self.rng.randrange(len(self.db.kafka_sources)) - source = self.db.kafka_sources[source_id] + source_id = self.rng.randrange(len(exe.db.kafka_sources)) + source = exe.db.kafka_sources[source_id] query = f"DROP SOURCE {source}" try: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + if ( + exe.db.scenario != Scenario.Kill + or "unknown catalog item" not in e.msg + ): raise e - del self.db.kafka_sources[source_id] + del exe.db.kafka_sources[source_id] class CreatePostgresSourceAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if len(self.db.postgres_sources) > MAX_POSTGRES_SOURCES: + with exe.db.lock: + if len(exe.db.postgres_sources) > MAX_POSTGRES_SOURCES: return - source_id = self.db.postgres_source_id - self.db.postgres_source_id += 1 - potential_clusters = [c for c in self.db.clusters if len(c.replicas) == 1] - schema = self.rng.choice(self.db.schemas) + source_id = exe.db.postgres_source_id + exe.db.postgres_source_id += 1 + potential_clusters = [c for c in exe.db.clusters if len(c.replicas) == 1] + schema = self.rng.choice(exe.db.schemas) cluster = self.rng.choice(potential_clusters) try: source = PostgresSource( - self.db.name(), source_id, cluster, schema, self.db.ports, self.rng + exe.db.name(), + exe.db.db_id * DB_OFFSET + source_id, + cluster, + schema, + exe.db.ports, + self.rng, ) source.create(exe) - self.db.postgres_sources.append(source) + exe.db.postgres_sources.append(source) except: - if self.db.scenario != Scenario.Kill: + if exe.db.scenario != Scenario.Kill: raise class DropPostgresSourceAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "still depended upon by", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.postgres_sources: + with exe.db.lock: + if not exe.db.postgres_sources: return - source_id = self.rng.randrange(len(self.db.postgres_sources)) - source = self.db.postgres_sources[source_id] + source_id = self.rng.randrange(len(exe.db.postgres_sources)) + source = exe.db.postgres_sources[source_id] query = f"DROP SOURCE {source.executor.source}" try: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + if ( + exe.db.scenario != Scenario.Kill + or "unknown catalog item" not in e.msg + ): raise e - del self.db.postgres_sources[source_id] + del exe.db.postgres_sources[source_id] class CreateKafkaSinkAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ # Another replica can be created in parallel "cannot create sink in cluster with more than one replica", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> 
None: - with self.db.lock: - if len(self.db.kafka_sinks) > MAX_KAFKA_SINKS: + with exe.db.lock: + if len(exe.db.kafka_sinks) > MAX_KAFKA_SINKS: return - sink_id = self.db.kafka_sink_id - self.db.kafka_sink_id += 1 - potential_clusters = [c for c in self.db.clusters if len(c.replicas) == 1] + sink_id = exe.db.kafka_sink_id + exe.db.kafka_sink_id += 1 + potential_clusters = [c for c in exe.db.clusters if len(c.replicas) == 1] cluster = self.rng.choice(potential_clusters) - schema = self.rng.choice(self.db.schemas) + schema = self.rng.choice(exe.db.schemas) sink = KafkaSink( - sink_id, + exe.db.db_id * DB_OFFSET + sink_id, cluster, schema, - self.rng.choice(self.db.db_objects_without_views()), + self.rng.choice(exe.db.db_objects_without_views()), self.rng, ) sink.create(exe) - self.db.kafka_sinks.append(sink) + exe.db.kafka_sinks.append(sink) class DropKafkaSinkAction(Action): - def errors_to_ignore(self) -> list[str]: + def errors_to_ignore(self, exe: Executor) -> list[str]: return [ "still depended upon by", - ] + super().errors_to_ignore() + ] + super().errors_to_ignore(exe) def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.kafka_sinks: + with exe.db.lock: + if not exe.db.kafka_sinks: return - sink_id = self.rng.randrange(len(self.db.kafka_sinks)) - sink = self.db.kafka_sinks[sink_id] + sink_id = self.rng.randrange(len(exe.db.kafka_sinks)) + sink = exe.db.kafka_sinks[sink_id] query = f"DROP SINK {sink}" try: exe.execute(query) except QueryError as e: # expected, see #20465 - if self.db.scenario != Scenario.Kill or "unknown catalog item" not in e.msg: + if ( + exe.db.scenario != Scenario.Kill + or "unknown catalog item" not in e.msg + ): raise e - del self.db.kafka_sinks[sink_id] + del exe.db.kafka_sinks[sink_id] class HttpPostAction(Action): def run(self, exe: Executor) -> None: - with self.db.lock: - if not self.db.webhook_sources: + with exe.db.lock: + if not exe.db.webhook_sources: return - source = self.rng.choice(self.db.webhook_sources) - url = f"http://{self.db.host}:{self.db.ports['http']}/api/webhook/{self.db}/public/{source}" + source = self.rng.choice(exe.db.webhook_sources) + url = f"http://{exe.db.host}:{exe.db.ports['http']}/api/webhook/{exe.db}/public/{source}" payload = source.body_format.to_data_type().random_value(self.rng) @@ -1052,7 +1085,7 @@ def run(self, exe: Executor) -> None: requests.post(url, data=payload.encode("utf-8"), headers=headers) except requests.exceptions.ConnectionError: # Expeceted when Mz is killed - if self.db.scenario != Scenario.Kill: + if exe.db.scenario != Scenario.Kill: raise diff --git a/misc/python/materialize/parallel_workload/database.py b/misc/python/materialize/parallel_workload/database.py index e3e5fb8853548..3f6a39bb5e7a7 100644 --- a/misc/python/materialize/parallel_workload/database.py +++ b/misc/python/materialize/parallel_workload/database.py @@ -33,6 +33,7 @@ from materialize.util import naughty_strings MAX_COLUMNS = 100 +MAX_INCLUDE_HEADERS = 5 MAX_ROWS = 1000 MAX_CLUSTERS = 10 MAX_CLUSTER_REPLICAS = 4 @@ -44,7 +45,6 @@ MAX_KAFKA_SOURCES = 20 MAX_POSTGRES_SOURCES = 20 MAX_KAFKA_SINKS = 20 -MAX_INCLUDE_HEADERS = 5 MAX_INITIAL_SCHEMAS = 1 MAX_INITIAL_CLUSTERS = 2 @@ -56,7 +56,7 @@ MAX_INITIAL_POSTGRES_SOURCES = 3 MAX_INITIAL_KAFKA_SINKS = 3 -MAX_IDENTIFIER_LENGTH = 255 +DB_OFFSET = 1_000_000 NAUGHTY_IDENTIFIERS = False @@ -654,6 +654,7 @@ def create(self, exe: Executor) -> None: class Database: + db_id: int seed: str complexity: Complexity scenario: Scenario @@ -682,6 +683,7 @@ class Database: def 
__init__( self, + db_id: int, rng: random.Random, seed: str, host: str, @@ -691,6 +693,7 @@ def __init__( naughty_identifiers: bool, ): global NAUGHTY_IDENTIFIERS + self.db_id = db_id self.seed = seed self.host = host self.ports = ports @@ -718,12 +721,15 @@ def __init__( view = View(rng, i, base_object, base_object2, rng.choice(self.schemas)) self.views.append(view) self.view_id = len(self.views) - self.roles = [Role(i) for i in range(rng.randint(0, MAX_INITIAL_ROLES))] + self.roles = [ + Role(db_id * DB_OFFSET + i) + for i in range(rng.randint(0, MAX_INITIAL_ROLES)) + ] self.role_id = len(self.roles) # At least one storage cluster required for WebhookSources self.clusters = [ Cluster( - i, + self.db_id * DB_OFFSET + i, managed=rng.choice([True, False]), size=rng.choice(["1", "2", "4"]), replication_factor=1, @@ -741,7 +747,7 @@ def __init__( self.kafka_sources = [ KafkaSource( self.name(), - i, + self.db_id * DB_OFFSET + i, rng.choice(self.clusters), rng.choice(self.schemas), ports, @@ -753,7 +759,7 @@ def __init__( self.postgres_sources = [ PostgresSource( self.name(), - i, + self.db_id * DB_OFFSET + i, rng.choice(self.clusters), rng.choice(self.schemas), ports, @@ -776,7 +782,7 @@ def __init__( self.lock = threading.Lock() def name(self) -> str: - return naughtify(f"db-pw-{self.seed}") + return naughtify(f"db-pw-{self.seed}-{self.db_id}") def __str__(self) -> str: return identifier(self.name()) @@ -810,48 +816,19 @@ def drop(self, exe: Executor) -> None: def create(self, exe: Executor) -> None: self.drop(exe) - exe.execute("ALTER SYSTEM SET enable_webhook_sources TO true") - exe.execute("ALTER SYSTEM SET max_schemas_per_database = 105") - # The presence of ALTER TABLE RENAME can cause the total number of tables to exceed MAX_TABLES - exe.execute("ALTER SYSTEM SET max_tables = 200") - exe.execute("ALTER SYSTEM SET max_materialized_views = 105") - exe.execute("ALTER SYSTEM SET max_sources = 105") - exe.execute("ALTER SYSTEM SET max_roles = 105") - exe.execute("ALTER SYSTEM SET max_clusters = 105") - exe.execute("ALTER SYSTEM SET max_replicas_per_cluster = 105") - # Most queries should not fail because of privileges - exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON TABLES TO PUBLIC" - ) - exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON TYPES TO PUBLIC" - ) - exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON SECRETS TO PUBLIC" - ) - exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON CONNECTIONS TO PUBLIC" - ) - exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON DATABASES TO PUBLIC" - ) - exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON SCHEMAS TO PUBLIC" - ) - exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON CLUSTERS TO PUBLIC" - ) exe.execute(f"CREATE DATABASE {self}") exe.execute(f"ALTER DATABASE {self} OWNER TO materialize") def create_relations(self, exe: Executor) -> None: - exe.execute("SELECT name FROM mz_clusters WHERE name LIKE 'c%'") - for row in exe.cur.fetchall(): - exe.execute(f"DROP CLUSTER {row[0]} CASCADE") - - exe.execute("SELECT name FROM mz_roles WHERE name LIKE 'r%'") - for row in exe.cur.fetchall(): - exe.execute(f"DROP ROLE {row[0]}") + # Roles and clusters are system wide, not per DB + if self.db_id == 0: + exe.execute("SELECT name FROM mz_clusters WHERE name LIKE 'c%'") + for row in exe.cur.fetchall(): + exe.execute(f"DROP CLUSTER {identifier(row[0])} CASCADE") + 
+ exe.execute("SELECT name FROM mz_roles WHERE name LIKE 'r%'") + for row in exe.cur.fetchall(): + exe.execute(f"DROP ROLE {identifier(row[0])}") exe.execute("CREATE CONNECTION kafka_conn FOR KAFKA BROKER 'kafka:9092'") exe.execute( diff --git a/misc/python/materialize/parallel_workload/executor.py b/misc/python/materialize/parallel_workload/executor.py index 592111fbf0cf2..27cd1f4613b73 100644 --- a/misc/python/materialize/parallel_workload/executor.py +++ b/misc/python/materialize/parallel_workload/executor.py @@ -9,10 +9,13 @@ import random import threading -from typing import TextIO +from typing import TYPE_CHECKING, TextIO import pg8000 +if TYPE_CHECKING: + from materialize.parallel_workload.database import Database + logging: TextIO | None lock: threading.Lock @@ -38,12 +41,18 @@ class Executor: pg_pid: int # Used by INSERT action to prevent writing into different tables in the same transaction insert_table: int | None + db: "Database" + reconnect_next: bool + rollback_next: bool - def __init__(self, rng: random.Random, cur: pg8000.Cursor): + def __init__(self, rng: random.Random, cur: pg8000.Cursor, db: "Database"): self.rng = rng self.cur = cur + self.db = db self.pg_pid = -1 self.insert_table = None + self.reconnect_next = True + self.rollback_next = True def set_isolation(self, level: str) -> None: self.execute(f"SET TRANSACTION_ISOLATION TO '{level}'") @@ -71,7 +80,7 @@ def log(self, msg: str) -> None: thread_name = threading.current_thread().getName() with lock: - print(f"[{thread_name}] {msg}", file=logging) + print(f"[{thread_name}][{self.db.name()}] {msg}", file=logging) logging.flush() def execute( diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index 0895e107aaa23..35d99b0cffa9f 100644 --- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -30,7 +30,19 @@ read_action_list, write_action_list, ) -from materialize.parallel_workload.database import Database +from materialize.parallel_workload.database import ( + MAX_CLUSTER_REPLICAS, + MAX_CLUSTERS, + MAX_KAFKA_SINKS, + MAX_KAFKA_SOURCES, + MAX_POSTGRES_SOURCES, + MAX_ROLES, + MAX_SCHEMAS, + MAX_TABLES, + MAX_VIEWS, + MAX_WEBHOOK_SOURCES, + Database, +) from materialize.parallel_workload.executor import Executor, initialize_logging from materialize.parallel_workload.settings import Complexity, Scenario from materialize.parallel_workload.worker import Worker @@ -48,13 +60,14 @@ def run( scenario: Scenario, num_threads: int | None, naughty_identifiers: bool, + num_databases: int, composition: Composition | None, ) -> None: num_threads = num_threads or os.cpu_count() or 10 random.seed(seed) print( - f"--- Running with: --seed={seed} --threads={num_threads} --runtime={runtime} --complexity={complexity.value} --scenario={scenario.value} --naughty_identifiers={naughty_identifiers} (--host={host})" + f"--- Running with: --seed={seed} --threads={num_threads} --runtime={runtime} --complexity={complexity.value} --scenario={scenario.value} --naughty_identifiers={naughty_identifiers} --databases={num_databases} (--host={host})" ) initialize_logging() @@ -63,28 +76,79 @@ def run( ).timestamp() rng = random.Random(random.randrange(SEED_RANGE)) - database = Database( - rng, seed, host, ports, complexity, scenario, naughty_identifiers - ) + databases = [ + Database(i, rng, seed, host, ports, complexity, scenario, naughty_identifiers) + for i in range(num_databases) + 
] system_conn = pg8000.connect( host=host, port=ports["mz_system"], user="mz_system", database="materialize" ) system_conn.autocommit = True - with system_conn.cursor() as cur: - database.create(Executor(rng, cur)) - system_conn.close() + with system_conn.cursor() as system_cur: + system_exe = Executor(rng, system_cur, databases[0]) + system_exe.execute("ALTER SYSTEM SET enable_webhook_sources TO true") + system_exe.execute( + f"ALTER SYSTEM SET max_schemas_per_database = {MAX_SCHEMAS + 5}" + ) + # The presence of ALTER TABLE RENAME can cause the total number of tables to exceed MAX_TABLES + system_exe.execute( + f"ALTER SYSTEM SET max_tables = {len(databases) * MAX_TABLES * 2}" + ) + system_exe.execute( + f"ALTER SYSTEM SET max_materialized_views = {len(databases) * MAX_VIEWS + 5}" + ) + system_exe.execute( + f"ALTER SYSTEM SET max_sources = {len(databases) * (MAX_WEBHOOK_SOURCES + MAX_KAFKA_SOURCES + MAX_POSTGRES_SOURCES) * 2}" + ) + system_exe.execute( + f"ALTER SYSTEM SET max_sinks = {len(databases) * MAX_KAFKA_SINKS + 5}" + ) + system_exe.execute( + f"ALTER SYSTEM SET max_roles = {len(databases) * MAX_ROLES + 5}" + ) + system_exe.execute( + f"ALTER SYSTEM SET max_clusters = {len(databases) * MAX_CLUSTERS + 5}" + ) + system_exe.execute( + f"ALTER SYSTEM SET max_replicas_per_cluster = {MAX_CLUSTER_REPLICAS + 5}" + ) + # Most queries should not fail because of privileges + system_exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON TABLES TO PUBLIC" + ) + system_exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON TYPES TO PUBLIC" + ) + system_exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON SECRETS TO PUBLIC" + ) + system_exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON CONNECTIONS TO PUBLIC" + ) + system_exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON DATABASES TO PUBLIC" + ) + system_exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON SCHEMAS TO PUBLIC" + ) + system_exe.execute( + "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON CLUSTERS TO PUBLIC" + ) + for database in databases: + database.create(system_exe) - conn = pg8000.connect( - host=host, - port=ports["materialized"], - user="materialize", - database=database.name(), - ) - conn.autocommit = True - with conn.cursor() as cur: - database.create_relations(Executor(rng, cur)) - conn.close() + conn = pg8000.connect( + host=host, + port=ports["materialized"], + user="materialize", + database=database.name(), + ) + conn.autocommit = True + with conn.cursor() as cur: + database.create_relations(Executor(rng, cur, database)) + conn.close() + system_conn.close() workers = [] threads = [] @@ -110,8 +174,7 @@ def run( weights, )[0] actions = [ - action_class(worker_rng, database) - for action_class in action_list.action_classes + action_class(worker_rng) for action_class in action_list.action_classes ] worker = Worker( worker_rng, @@ -130,7 +193,7 @@ def run( thread = threading.Thread( name=thread_name, target=worker.run, - args=(host, ports["materialized"], "materialize", database.name()), + args=(host, ports["materialized"], "materialize", databases), ) thread.start() threads.append(thread) @@ -138,7 +201,7 @@ def run( if scenario == Scenario.Cancel: worker = Worker( worker_rng, - [CancelAction(worker_rng, database, workers)], + [CancelAction(worker_rng, workers)], [1], end_time, autocommit=False, @@ -148,7 +211,7 @@ def run( thread = threading.Thread( 
name="cancel", target=worker.run, - args=(host, ports["mz_system"], "mz_system", str(database)), + args=(host, ports["mz_system"], "mz_system", databases), ) thread.start() threads.append(thread) @@ -156,7 +219,7 @@ def run( assert composition, "Kill scenario only works in mzcompose" worker = Worker( worker_rng, - [KillAction(worker_rng, database, composition)], + [KillAction(worker_rng, composition)], [1], end_time, autocommit=False, @@ -166,7 +229,7 @@ def run( thread = threading.Thread( name="kill", target=worker.run, - args=(host, ports["materialized"], "materialize", str(database)), + args=(host, ports["materialized"], "materialize", databases), ) thread.start() threads.append(thread) @@ -204,8 +267,9 @@ def run( conn = pg8000.connect(host=host, port=ports["materialized"], user="materialize") conn.autocommit = True with conn.cursor() as cur: - print(f"Dropping database {database}") - database.drop(Executor(rng, cur)) + for database in databases: + print(f"Dropping database {database}") + database.drop(Executor(rng, cur, database)) conn.close() ignored_errors: defaultdict[str, Counter[type[Action]]] = defaultdict(Counter) @@ -253,6 +317,12 @@ def parse_common_args(parser: argparse.ArgumentParser) -> None: action="store_true", help="Whether to use naughty strings as identifiers, makes the queries unreadable", ) + parser.add_argument( + "--databases", + default=2, + type=int, + help="Number of databases to create and run against, 2 by default", + ) def main() -> int: @@ -301,6 +371,7 @@ def main() -> int: Scenario(args.scenario), args.threads, args.naughty_identifiers, + args.databases, composition=None, # only works in mzcompose ) return 0 diff --git a/misc/python/materialize/parallel_workload/worker.py b/misc/python/materialize/parallel_workload/worker.py index 2114800f20352..167683db97777 100644 --- a/misc/python/materialize/parallel_workload/worker.py +++ b/misc/python/materialize/parallel_workload/worker.py @@ -15,6 +15,7 @@ import pg8000 from materialize.parallel_workload.action import Action, ReconnectAction +from materialize.parallel_workload.database import Database from materialize.parallel_workload.executor import Executor, QueryError @@ -26,7 +27,7 @@ class Worker: num_queries: int autocommit: bool system: bool - exe: Executor | None + exes: list[Executor] ignored_errors: defaultdict[str, Counter[type[Action]]] def __init__( @@ -46,25 +47,30 @@ def __init__( self.autocommit = autocommit self.system = system self.ignored_errors = defaultdict(Counter) - self.exe = None + self.exes = [] + + def run(self, host: str, port: int, user: str, databases: list[Database]) -> None: + self.conns = [ + pg8000.connect(host=host, port=port, user=user, database=database.name()) + for database in databases + ] + for database, conn in zip(databases, self.conns): + conn.autocommit = self.autocommit + cur = conn.cursor() + exe = Executor(self.rng, cur, database) + exe.set_isolation("SERIALIZABLE") + cur.execute("SELECT pg_backend_pid()") + exe.pg_pid = cur.fetchall()[0][0] + self.exes.append(exe) - def run(self, host: str, port: int, user: str, database: str) -> None: - self.conn = pg8000.connect(host=host, port=port, user=user, database=database) - self.conn.autocommit = self.autocommit - cur = self.conn.cursor() - self.exe = Executor(self.rng, cur) - self.exe.set_isolation("SERIALIZABLE") - cur.execute("SELECT pg_backend_pid()") - self.exe.pg_pid = cur.fetchall()[0][0] - rollback_next = True - reconnect_next = True while time.time() < self.end_time: + exe = self.rng.choice(self.exes) action = 
self.rng.choices(self.actions, self.weights)[0] self.num_queries += 1 try: - if rollback_next: + if exe.rollback_next: try: - self.exe.rollback() + exe.rollback() except QueryError as e: if ( "Please disconnect and re-connect" in e.msg @@ -72,18 +78,16 @@ def run(self, host: str, port: int, user: str, database: str) -> None: or "Can't create a connection to host" in e.msg or "Connection refused" in e.msg ): - reconnect_next = True - rollback_next = False + exe.reconnect_next = True + exe.rollback_next = False continue - rollback_next = False - if reconnect_next: - ReconnectAction(self.rng, action.db, random_role=False).run( - self.exe - ) - reconnect_next = False - action.run(self.exe) + exe.rollback_next = False + if exe.reconnect_next: + ReconnectAction(self.rng, random_role=False).run(exe) + exe.reconnect_next = False + action.run(exe) except QueryError as e: - for error in action.errors_to_ignore(): + for error in action.errors_to_ignore(exe): if error in e.msg: self.ignored_errors[error][type(action)] += 1 if ( @@ -92,11 +96,13 @@ def run(self, host: str, port: int, user: str, database: str) -> None: or "Can't create a connection to host" in e.msg or "Connection refused" in e.msg ): - reconnect_next = True + exe.reconnect_next = True else: - rollback_next = True + exe.rollback_next = True break else: thread_name = threading.current_thread().getName() - print(f"{thread_name} Query failed: {e.query} {e.msg}") + print( + f"[{thread_name}][{exe.db.name()}] Query failed: {e.query} {e.msg}" + ) raise diff --git a/test/parallel-workload/mzcompose.py b/test/parallel-workload/mzcompose.py index bfc0a4a562e4e..13c89a7b45007 100644 --- a/test/parallel-workload/mzcompose.py +++ b/test/parallel-workload/mzcompose.py @@ -68,6 +68,7 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: Scenario(args.scenario), args.threads, args.naughty_identifiers, + args.databases, c, ) # TODO: Only ignore errors that will be handled by parallel-workload, not others From a19593da6ef814d019f13d43ccec9fbc3cf8b75e Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Mon, 23 Oct 2023 17:09:30 +0000 Subject: [PATCH 04/17] parallel-workload: Workaround for 21954 --- ci/nightly/pipeline.template.yml | 16 +++++----- .../materialize/parallel_workload/action.py | 19 ++++++++++++ .../parallel_workload/parallel_workload.py | 30 +++++++++++++++---- 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index ba7813703d038..eeb7750f6866d 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -797,7 +797,7 @@ steps: - id: parallel-workload-dml label: "Parallel Workload (DML)" artifact_paths: [junit_*.xml, parallel-workload-queries.log.zst] - timeout_in_minutes: 40 + timeout_in_minutes: 60 agents: queue: builder-linux-x86_64 plugins: @@ -808,7 +808,7 @@ steps: - id: parallel-workload-ddl label: "Parallel Workload (DDL)" artifact_paths: [junit_*.xml, parallel-workload-queries.log.zst] - timeout_in_minutes: 40 + timeout_in_minutes: 60 agents: queue: builder-linux-x86_64 plugins: @@ -819,7 +819,7 @@ steps: - id: parallel-workload-100-threads label: "Parallel Workload (100 threads)" artifact_paths: [junit_*.xml, parallel-workload-queries.log.zst] - timeout_in_minutes: 40 + timeout_in_minutes: 60 agents: queue: builder-linux-x86_64 plugins: @@ -831,7 +831,7 @@ steps: - id: parallel-workload-rename-naughty label: "Parallel Workload (rename + naughty identifiers)" artifact_paths: [junit_*.xml, 
parallel-workload-queries.log.zst]
-    timeout_in_minutes: 40
+    timeout_in_minutes: 60
     agents:
       queue: builder-linux-x86_64
     plugins:
@@ -842,7 +842,7 @@ steps:
   - id: parallel-workload-rename
     label: "Parallel Workload (rename)"
     artifact_paths: [junit_*.xml, parallel-workload-queries.log.zst]
-    timeout_in_minutes: 40
+    timeout_in_minutes: 60
     agents:
       queue: builder-linux-x86_64
     plugins:
@@ -853,7 +853,7 @@ steps:
   - id: parallel-workload-cancel
     label: "Parallel Workload (cancel)"
    artifact_paths: [junit_*.xml, parallel-workload-queries.log.zst]
-    timeout_in_minutes: 40
+    timeout_in_minutes: 60
     agents:
       queue: builder-linux-x86_64
     plugins:
@@ -864,7 +864,7 @@ steps:
   - id: parallel-workload-kill
     label: "Parallel Workload (kill)"
     artifact_paths: [junit_*.xml, parallel-workload-queries.log.zst]
-    timeout_in_minutes: 40
+    timeout_in_minutes: 60
     agents:
       queue: builder-linux-x86_64
     plugins:
@@ -875,7 +875,7 @@ steps:
   - id: parallel-workload-backup-restore
     label: "Parallel Workload (backup & restore)"
     artifact_paths: [junit_*.xml, parallel-workload-queries.log.zst]
-    timeout_in_minutes: 40
+    timeout_in_minutes: 60
     agents:
       queue: builder-linux-x86_64
     plugins:
diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py
index e49161133eddc..045b745a642cb 100644
--- a/misc/python/materialize/parallel_workload/action.py
+++ b/misc/python/materialize/parallel_workload/action.py
@@ -63,6 +63,7 @@ def errors_to_ignore(self, exe: Executor) -> list[str]:
         result = [
             "permission denied for",
             "must be owner of",
+            "network error",  # #21954, remove when fixed
         ]
         if exe.db.complexity == Complexity.DDL:
             result.extend(
@@ -861,6 +862,24 @@ def run(self, exe: Executor) -> None:
         time.sleep(self.rng.uniform(20, 180))


+class BackupRestoreAction(Action):
+    composition: Composition
+    exes: list[Executor]
+
+    def __init__(
+        self,
+        rng: random.Random,
+        composition: Composition,
+        exes: list[Executor]) -> None:
+        super().__init__(rng)
+        self.composition = composition
+        self.exes = exes
+
+    def run(self, exe: Executor) -> None:
+        time.sleep(self.rng.uniform(10, 120))
+        # TODO: Backup & restore here
+
+
 class CreateWebhookSourceAction(Action):
     def run(self, exe: Executor) -> None:
         with exe.db.lock:
diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py
index 35d99b0cffa9f..8734a5b77da64 100644
--- a/misc/python/materialize/parallel_workload/parallel_workload.py
+++ b/misc/python/materialize/parallel_workload/parallel_workload.py
@@ -89,29 +89,29 @@ def run(
         system_exe = Executor(rng, system_cur, databases[0])
         system_exe.execute("ALTER SYSTEM SET enable_webhook_sources TO true")
         system_exe.execute(
-            f"ALTER SYSTEM SET max_schemas_per_database = {MAX_SCHEMAS + 5}"
+            f"ALTER SYSTEM SET max_schemas_per_database = {MAX_SCHEMAS * 2}"
         )
         # The presence of ALTER TABLE RENAME can cause the total number of tables to exceed MAX_TABLES
         system_exe.execute(
             f"ALTER SYSTEM SET max_tables = {len(databases) * MAX_TABLES * 2}"
         )
         system_exe.execute(
-            f"ALTER SYSTEM SET max_materialized_views = {len(databases) * MAX_VIEWS + 5}"
+            f"ALTER SYSTEM SET max_materialized_views = {len(databases) * MAX_VIEWS * 2}"
         )
         system_exe.execute(
             f"ALTER SYSTEM SET max_sources = {len(databases) * (MAX_WEBHOOK_SOURCES + MAX_KAFKA_SOURCES + MAX_POSTGRES_SOURCES) * 2}"
         )
         system_exe.execute(
-            f"ALTER SYSTEM SET max_sinks = {len(databases) * MAX_KAFKA_SINKS + 5}"
+            f"ALTER SYSTEM SET max_sinks = {len(databases) * 
MAX_KAFKA_SINKS * 2}" ) system_exe.execute( - f"ALTER SYSTEM SET max_roles = {len(databases) * MAX_ROLES + 5}" + f"ALTER SYSTEM SET max_roles = {len(databases) * MAX_ROLES * 2}" ) system_exe.execute( - f"ALTER SYSTEM SET max_clusters = {len(databases) * MAX_CLUSTERS + 5}" + f"ALTER SYSTEM SET max_clusters = {len(databases) * MAX_CLUSTERS * 2}" ) system_exe.execute( - f"ALTER SYSTEM SET max_replicas_per_cluster = {MAX_CLUSTER_REPLICAS + 5}" + f"ALTER SYSTEM SET max_replicas_per_cluster = {MAX_CLUSTER_REPLICAS * 2}" ) # Most queries should not fail because of privileges system_exe.execute( @@ -233,6 +233,24 @@ def run( ) thread.start() threads.append(thread) + elif scenario == Scenario.BackupRestore: + assert composition, "Backup & Restore scenario only works in mzcompose" + worker = Worker( + worker_rng, + [BackupRestoreAction(worker_rng, composition, exes)], + [1], + end_time, + autocommit=False, + system=False, + ) + workers.append(worker) + thread = threading.Thread( + name="kill", + target=worker.run, + args=(host, ports["materialized"], "materialize", databases), + ) + thread.start() + threads.append(thread) elif scenario in (Scenario.Regression, Scenario.Rename): pass else: From 5a549edeee06ac09790c72d05c6cd6ebca610abb Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Wed, 25 Oct 2023 13:07:29 +0000 Subject: [PATCH 05/17] parallel-workload: Disable Set cluster in transactions --- misc/python/materialize/parallel_workload/action.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 045b745a642cb..29636a41a8d2b 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -92,7 +92,7 @@ def errors_to_ignore(self, exe: Executor) -> list[str]: "Connection refused", ] ) - if self.db.scenario == Scenario.Rename: + if exe.db.scenario == Scenario.Rename: result.extend(["unknown schema", "ambiguous reference to schema name"]) if materialize.parallel_workload.database.NAUGHTY_IDENTIFIERS: result.extend(["identifier length exceeds 255 bytes"]) @@ -1124,7 +1124,7 @@ def __init__( read_action_list = ActionList( [ (SelectAction, 100), - (SetClusterAction, 1), + # (SetClusterAction, 1), # SET cluster cannot be called in an active transaction (CommitRollbackAction, 1), (ReconnectAction, 1), ], @@ -1134,7 +1134,7 @@ def __init__( fetch_action_list = ActionList( [ (FetchAction, 30), - (SetClusterAction, 1), + # (SetClusterAction, 1), # SET cluster cannot be called in an active transaction (ReconnectAction, 1), ], autocommit=False, @@ -1143,7 +1143,7 @@ def __init__( write_action_list = ActionList( [ (InsertAction, 100), - (SetClusterAction, 1), + # (SetClusterAction, 1), # SET cluster cannot be called in an active transaction (HttpPostAction, 50), (CommitRollbackAction, 1), (ReconnectAction, 1), From 47a8e368a84d079d7b29c14b60bc3d1d4ec12c69 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Wed, 25 Oct 2023 15:39:50 +0000 Subject: [PATCH 06/17] parallel-workload: Add backup&restore scenario --- ci/nightly/pipeline.template.yml | 2 +- .../materialize/parallel_workload/action.py | 67 ++++++++++++++++--- .../parallel_workload/parallel_workload.py | 3 +- test/parallel-workload/mzcompose.py | 22 ++++++ 4 files changed, 83 insertions(+), 11 deletions(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index eeb7750f6866d..4b418ff7b9552 100644 --- 
a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -881,7 +881,7 @@ steps: plugins: - ./ci/plugins/mzcompose: composition: parallel-workload - args: [--runtime=1500, --scenario=backup-restore] + args: [--runtime=1500, --scenario=backup-restore, --naughty-identifiers] - id: incident-70 label: "Test for incident 70" diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 29636a41a8d2b..928f07ce6209b 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -18,6 +18,7 @@ import materialize.parallel_workload.database from materialize.data_ingest.data_type import NUMBER_TYPES, Text, TextTextMap from materialize.mzcompose.composition import Composition +from materialize.mzcompose.services.minio import MINIO_BLOB_URI from materialize.parallel_workload.database import ( DB_OFFSET, MAX_CLUSTER_REPLICAS, @@ -33,6 +34,7 @@ MAX_WEBHOOK_SOURCES, Cluster, ClusterReplica, + Database, DBObject, KafkaSink, KafkaSource, @@ -859,25 +861,72 @@ def run(self, exe: Executor) -> None: # Otherwise getting failure on "up" locally time.sleep(1) self.composition.up("materialized", detach=True) - time.sleep(self.rng.uniform(20, 180)) + time.sleep(self.rng.uniform(120, 240)) +# TODO: Don't restore immediately, keep copy Database objects class BackupRestoreAction(Action): composition: Composition - exes: list[Executor] + databases: list[Database] + num: int def __init__( - self, - rng: random.Random, - composition: Composition, - exes: list[Executor]) -> None: + self, rng: random.Random, composition: Composition, databases: list[Database] + ) -> None: super().__init__(rng) self.composition = composition - self.exes = exes + self.databases = databases + self.num = 0 def run(self, exe: Executor) -> None: - time.sleep(self.rng.uniform(10, 120)) - # TODO: Backup & restore here + self.num += 1 + time.sleep(self.rng.uniform(10, 240)) + for db in self.databases: + db.lock.acquire() + + try: + # Backup + self.composition.exec("mc", "mc", "mb", f"persist/crdb-backup{self.num}") + self.composition.exec( + "cockroach", + "cockroach", + "sql", + "--insecure", + "-e", + f""" + CREATE EXTERNAL CONNECTION backup_bucket{self.num} AS 's3://persist/crdb-backup{self.num}?AWS_ENDPOINT=http://minio:9000/&AWS_REGION=minio&AWS_ACCESS_KEY_ID=minioadmin&AWS_SECRET_ACCESS_KEY=minioadmin'; + BACKUP INTO 'external://backup_bucket{self.num}'; + """, + ) + self.composition.kill("materialized") + + # Restore + self.composition.exec( + "cockroach", + "cockroach", + "sql", + "--insecure", + "-e", + f""" + DROP DATABASE defaultdb; + RESTORE DATABASE defaultdb FROM LATEST IN 'external://backup_bucket{self.num}'; + SELECT shard, min(sequence_number), max(sequence_number) + FROM consensus.consensus GROUP BY 1 ORDER BY 2 DESC, 3 DESC, 1 ASC LIMIT 32; + """, + ) + self.composition.run( + "persistcli", + "admin", + "--commit", + "restore-blob", + f"--blob-uri={MINIO_BLOB_URI}", + "--consensus-uri=postgres://root@cockroach:26257?options=--search_path=consensus", + ) + self.composition.up("materialized") + + finally: + for db in self.databases: + db.lock.release() class CreateWebhookSourceAction(Action): diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index 8734a5b77da64..16539d1d9a845 100644 --- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ 
b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -22,6 +22,7 @@ from materialize.mzcompose.composition import Composition from materialize.parallel_workload.action import ( Action, + BackupRestoreAction, CancelAction, KillAction, ddl_action_list, @@ -237,7 +238,7 @@ def run( assert composition, "Backup & Restore scenario only works in mzcompose" worker = Worker( worker_rng, - [BackupRestoreAction(worker_rng, composition, exes)], + [BackupRestoreAction(worker_rng, composition, databases)], [1], end_time, autocommit=False, diff --git a/test/parallel-workload/mzcompose.py b/test/parallel-workload/mzcompose.py index 13c89a7b45007..c4ee9f7b5974f 100644 --- a/test/parallel-workload/mzcompose.py +++ b/test/parallel-workload/mzcompose.py @@ -9,9 +9,11 @@ from materialize.mzcompose.composition import Composition, WorkflowArgumentParser +from materialize.mzcompose.service import Service from materialize.mzcompose.services.cockroach import Cockroach from materialize.mzcompose.services.kafka import Kafka from materialize.mzcompose.services.materialized import Materialized +from materialize.mzcompose.services.minio import Mc, Minio from materialize.mzcompose.services.postgres import Postgres from materialize.mzcompose.services.schema_registry import SchemaRegistry from materialize.mzcompose.services.zookeeper import Zookeeper @@ -32,11 +34,18 @@ ], ), SchemaRegistry(), + Minio(setup_materialize=True), + Mc(), Materialized( external_cockroach=True, restart="on-failure", + external_minio=True, ports=["6975:6875", "6976:6876", "6977:6877"], ), + Service( + name="persistcli", + config={"mzbuild": "jobs"}, + ), ] @@ -51,9 +60,22 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: "zookeeper", "kafka", "schema-registry", + "minio", "materialized", ] c.up(*service_names) + c.up("mc", persistent=True) + c.exec( + "mc", + "mc", + "alias", + "set", + "persist", + "http://minio:9000/", + "minioadmin", + "minioadmin", + ) + c.exec("mc", "mc", "version", "enable", "persist/persist") ports = {s: c.default_port(s) for s in service_names} ports["http"] = c.port("materialized", 6876) From a7413be3e45098e4ff4e1c8a8f03f8740d483bc9 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Wed, 25 Oct 2023 22:55:08 +0000 Subject: [PATCH 07/17] parallel-workload: Reenable NULLs after 21937 has been fixed --- misc/python/materialize/data_ingest/data_type.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc/python/materialize/data_ingest/data_type.py b/misc/python/materialize/data_ingest/data_type.py index c62cffeb7510c..69e0963fd2e97 100644 --- a/misc/python/materialize/data_ingest/data_type.py +++ b/misc/python/materialize/data_ingest/data_type.py @@ -231,7 +231,7 @@ def random_value( if rng.randrange(10) == 0: result = rng.choice( [ - # "NULL", # TODO: Reenable after #21937 is fixed + "NULL", "0.0", "True", # "", From c7504fa250bb8b5429b16cd578ec20661df5644a Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Thu, 26 Oct 2023 08:40:47 +0000 Subject: [PATCH 08/17] data-ingest: Use QueryError too --- misc/python/materialize/data_ingest/executor.py | 5 +++-- .../materialize/data_ingest/query_error.py | 17 +++++++++++++++++ .../materialize/parallel_workload/action.py | 3 ++- .../materialize/parallel_workload/executor.py | 11 ++--------- .../materialize/parallel_workload/worker.py | 3 ++- 5 files changed, 26 insertions(+), 13 deletions(-) create mode 100644 misc/python/materialize/data_ingest/query_error.py diff --git 
a/misc/python/materialize/data_ingest/executor.py b/misc/python/materialize/data_ingest/executor.py index d72a1eb668862..ac9dc5758f89c 100644 --- a/misc/python/materialize/data_ingest/executor.py +++ b/misc/python/materialize/data_ingest/executor.py @@ -25,6 +25,7 @@ from materialize.data_ingest.data_type import Backend from materialize.data_ingest.field import Field, formatted_value +from materialize.data_ingest.query_error import QueryError from materialize.data_ingest.row import Operation from materialize.data_ingest.transaction import Transaction @@ -76,9 +77,9 @@ def execute(self, cur: pg8000.Cursor, query: str) -> None: self.reconnect() with self.mz_conn.cursor() as cur: self.execute(cur, query) - except: + except Exception as e: print(f"Query failed: {query}") - raise + raise QueryError(str(e), query) def execute_with_retry_on_error( self, diff --git a/misc/python/materialize/data_ingest/query_error.py b/misc/python/materialize/data_ingest/query_error.py new file mode 100644 index 0000000000000..713ef31460dcd --- /dev/null +++ b/misc/python/materialize/data_ingest/query_error.py @@ -0,0 +1,17 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + + +class QueryError(Exception): + msg: str + query: str + + def __init__(self, msg: str, query: str): + self.msg = msg + self.query = query diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 928f07ce6209b..8ad51ae7260de 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -17,6 +17,7 @@ import materialize.parallel_workload.database from materialize.data_ingest.data_type import NUMBER_TYPES, Text, TextTextMap +from materialize.data_ingest.query_error import QueryError from materialize.mzcompose.composition import Composition from materialize.mzcompose.services.minio import MINIO_BLOB_URI from materialize.parallel_workload.database import ( @@ -45,7 +46,7 @@ View, WebhookSource, ) -from materialize.parallel_workload.executor import Executor, QueryError +from materialize.parallel_workload.executor import Executor from materialize.parallel_workload.settings import Complexity, Scenario if TYPE_CHECKING: diff --git a/misc/python/materialize/parallel_workload/executor.py b/misc/python/materialize/parallel_workload/executor.py index 27cd1f4613b73..1159c0bba159c 100644 --- a/misc/python/materialize/parallel_workload/executor.py +++ b/misc/python/materialize/parallel_workload/executor.py @@ -13,6 +13,8 @@ import pg8000 +from materialize.data_ingest.query_error import QueryError + if TYPE_CHECKING: from materialize.parallel_workload.database import Database @@ -26,15 +28,6 @@ def initialize_logging() -> None: lock = threading.Lock() -class QueryError(Exception): - msg: str - query: str - - def __init__(self, msg: str, query: str): - self.msg = msg - self.query = query - - class Executor: rng: random.Random cur: pg8000.Cursor diff --git a/misc/python/materialize/parallel_workload/worker.py b/misc/python/materialize/parallel_workload/worker.py index 167683db97777..92eecd72fc8cc 100644 --- a/misc/python/materialize/parallel_workload/worker.py +++ 
b/misc/python/materialize/parallel_workload/worker.py @@ -14,9 +14,10 @@ import pg8000 +from materialize.data_ingest.query_error import QueryError from materialize.parallel_workload.action import Action, ReconnectAction from materialize.parallel_workload.database import Database -from materialize.parallel_workload.executor import Executor, QueryError +from materialize.parallel_workload.executor import Executor class Worker: From 3bddb002bdd210516ca567644836c259cd749b74 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Thu, 26 Oct 2023 08:43:36 +0000 Subject: [PATCH 09/17] parallel-workload: Try reenabling 100 threads --- ci/nightly/pipeline.template.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index 4b418ff7b9552..a06efbb4b07ac 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -826,7 +826,6 @@ steps: - ./ci/plugins/mzcompose: composition: parallel-workload args: [--runtime=1500, --threads=100] - skip: "TODO(def-): Reenable when #21954 is fixed" - id: parallel-workload-rename-naughty label: "Parallel Workload (rename + naughty identifiers)" From cc63474f4d3fe3545b5b33b8fe2e46c24c667390 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Thu, 26 Oct 2023 10:19:08 +0000 Subject: [PATCH 10/17] Fix connection errors during backup&restore --- misc/python/materialize/parallel_workload/action.py | 6 +++--- .../materialize/parallel_workload/parallel_workload.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 8ad51ae7260de..0ead98a123aa9 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -1152,9 +1152,9 @@ def run(self, exe: Executor) -> None: ) try: requests.post(url, data=payload.encode("utf-8"), headers=headers) - except requests.exceptions.ConnectionError: - # Expeceted when Mz is killed - if exe.db.scenario != Scenario.Kill: + except (requests.exceptions.ConnectionError): + # Expected when Mz is killed + if exe.db.scenario not in (Scenario.Kill, Scenario.BackupRestore): raise diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index 16539d1d9a845..30f3fb5652eaf 100644 --- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -68,7 +68,7 @@ def run( random.seed(seed) print( - f"--- Running with: --seed={seed} --threads={num_threads} --runtime={runtime} --complexity={complexity.value} --scenario={scenario.value} --naughty_identifiers={naughty_identifiers} --databases={num_databases} (--host={host})" + f"--- Running with: --seed={seed} --threads={num_threads} --runtime={runtime} --complexity={complexity.value} --scenario={scenario.value} {'--naughty-identifiers ' if naughty_identifiers else ''}--databases={num_databases} (--host={host})" ) initialize_logging() From 7ff51d7222c21258577e4af1c94a88697f4dbef7 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Thu, 26 Oct 2023 13:07:06 +0000 Subject: [PATCH 11/17] parallel-workload: Try to reduce memory usage to prevent OoM --- .../materialize/parallel_workload/action.py | 7 ++++-- .../materialize/parallel_workload/database.py | 24 ++++++++++++------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/misc/python/materialize/parallel_workload/action.py 
b/misc/python/materialize/parallel_workload/action.py index 0ead98a123aa9..895afa35f1ad0 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -217,8 +217,7 @@ def run(self, exe: Executor) -> None: return exe.execute(query) exe.insert_table = table.table_id - with exe.db.lock: - table.num_rows += 1 + table.num_rows += 1 class SourceInsertAction(Action): @@ -230,6 +229,9 @@ def run(self, exe: Executor) -> None: source = self.rng.choice(sources) with source.lock: transaction = next(source.generator) + source.num_rows += sum( + [len(row_list.rows) for row_list in transaction.row_lists] + ) source.executor.run(transaction) @@ -1151,6 +1153,7 @@ def run(self, exe: Executor) -> None: f"POST Headers: {', '.join(headers_strs)} Body: {payload.encode('utf-8')}" ) try: + source.num_rows += 1 requests.post(url, data=payload.encode("utf-8"), headers=headers) except (requests.exceptions.ConnectionError): # Expected when Mz is killed diff --git a/misc/python/materialize/parallel_workload/database.py b/misc/python/materialize/parallel_workload/database.py index 3f6a39bb5e7a7..50d18dc62f479 100644 --- a/misc/python/materialize/parallel_workload/database.py +++ b/misc/python/materialize/parallel_workload/database.py @@ -35,16 +35,16 @@ MAX_COLUMNS = 100 MAX_INCLUDE_HEADERS = 5 MAX_ROWS = 1000 -MAX_CLUSTERS = 10 -MAX_CLUSTER_REPLICAS = 4 +MAX_CLUSTERS = 5 +MAX_CLUSTER_REPLICAS = 3 MAX_SCHEMAS = 10 -MAX_TABLES = 100 -MAX_VIEWS = 100 -MAX_ROLES = 100 -MAX_WEBHOOK_SOURCES = 20 -MAX_KAFKA_SOURCES = 20 -MAX_POSTGRES_SOURCES = 20 -MAX_KAFKA_SINKS = 20 +MAX_TABLES = 50 +MAX_VIEWS = 50 +MAX_ROLES = 50 +MAX_WEBHOOK_SOURCES = 10 +MAX_KAFKA_SOURCES = 10 +MAX_POSTGRES_SOURCES = 10 +MAX_KAFKA_SINKS = 10 MAX_INITIAL_SCHEMAS = 1 MAX_INITIAL_CLUSTERS = 2 @@ -329,6 +329,7 @@ class WebhookSource(DBObject): explicit_include_headers: list[str] check: str | None schema: Schema + num_rows: int def __init__( self, source_id: int, cluster: "Cluster", schema: Schema, rng: random.Random @@ -340,6 +341,7 @@ def __init__( self.body_format = rng.choice([e for e in BodyFormat]) self.include_headers = rng.choice([True, False]) self.explicit_include_headers = [] + self.num_rows = 0 self.columns = [ WebhookColumn( "body", @@ -409,6 +411,7 @@ class KafkaSource(DBObject): lock: threading.Lock columns: list[KafkaColumn] schema: Schema + num_rows: int def __init__( self, @@ -422,6 +425,7 @@ def __init__( self.source_id = source_id self.cluster = cluster self.schema = schema + self.num_rows = 0 fields = [] for i in range(rng.randint(1, 10)): fields.append( @@ -525,6 +529,7 @@ class PostgresSource(DBObject): lock: threading.Lock columns: list[PostgresColumn] schema: Schema + num_rows: int def __init__( self, @@ -538,6 +543,7 @@ def __init__( self.source_id = source_id self.cluster = cluster self.schema = schema + self.num_rows = 0 fields = [] for i in range(rng.randint(1, 10)): fields.append( From 420c6a0799a69b330aabbdcc71c1dcad65acab65 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Thu, 26 Oct 2023 17:42:23 +0000 Subject: [PATCH 12/17] Multiple dbs in same connection --- .../materialize/data_ingest/executor.py | 22 ++-- .../materialize/parallel_workload/action.py | 70 +++++++--- .../materialize/parallel_workload/database.py | 123 ++++++++++-------- .../materialize/parallel_workload/executor.py | 2 +- .../parallel_workload/parallel_workload.py | 83 +++++------- .../materialize/parallel_workload/worker.py | 56 ++++---- test/parallel-workload/mzcompose.py | 1 - 7 files 
changed, 183 insertions(+), 174 deletions(-) diff --git a/misc/python/materialize/data_ingest/executor.py b/misc/python/materialize/data_ingest/executor.py index ac9dc5758f89c..742619fae4543 100644 --- a/misc/python/materialize/data_ingest/executor.py +++ b/misc/python/materialize/data_ingest/executor.py @@ -218,10 +218,10 @@ def create(self) -> None: with self.mz_conn.cursor() as cur: self.execute( cur, - f"""CREATE SOURCE {identifier(self.schema)}.{identifier(self.table)} - FROM KAFKA CONNECTION kafka_conn (TOPIC '{self.topic}') + f"""CREATE SOURCE {identifier(self.database)}.{identifier(self.schema)}.{identifier(self.table)} + FROM KAFKA CONNECTION materialize.public.kafka_conn (TOPIC '{self.topic}') FORMAT AVRO - USING CONFLUENT SCHEMA REGISTRY CONNECTION csr_conn + USING CONFLUENT SCHEMA REGISTRY CONNECTION materialize.public.csr_conn ENVELOPE UPSERT""", ) self.mz_conn.autocommit = False @@ -334,7 +334,7 @@ def create(self) -> None: ) self.execute( cur, - f"""CREATE SOURCE {identifier(self.schema)}.{identifier(self.source)} + f"""CREATE SOURCE {identifier(self.database)}.{identifier(self.schema)}.{identifier(self.source)} FROM POSTGRES CONNECTION pg{self.num} (PUBLICATION 'postgres_source') FOR TABLES ({identifier(self.table)} AS {identifier(self.table)})""", ) @@ -426,13 +426,13 @@ def create(self) -> None: self.execute(cur, f"DROP TABLE IF EXISTS {identifier(self.table_original)}") self.execute( cur, - f"""CREATE TABLE {identifier(self.schema)}.{identifier(self.table_original)} ( + f"""CREATE TABLE {identifier(self.database)}.{identifier(self.schema)}.{identifier(self.table_original)} ( {", ".join(values)}, PRIMARY KEY ({", ".join(keys)}));""", ) self.execute( cur, - f"""CREATE SINK {identifier(self.schema)}.sink{self.num} FROM {identifier(self.table_original)} + f"""CREATE SINK {identifier(self.database)}.{identifier(self.schema)}.sink{self.num} FROM {identifier(self.table_original)} INTO KAFKA CONNECTION kafka_conn (TOPIC '{self.topic}') KEY ({", ".join([identifier(key) for key in keys])}) FORMAT AVRO @@ -441,7 +441,7 @@ def create(self) -> None: ) self.execute_with_retry_on_error( cur, - f"""CREATE SOURCE {identifier(self.schema)}.{identifier(self.table)} + f"""CREATE SOURCE {identifier(self.database)}.{identifier(self.schema)}.{identifier(self.table)} FROM KAFKA CONNECTION kafka_conn (TOPIC '{self.topic}') FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY CONNECTION csr_conn @@ -467,7 +467,7 @@ def run(self, transaction: Transaction) -> None: ) self.execute( cur, - f"""INSERT INTO {identifier(self.schema)}.{identifier(self.table_original)} + f"""INSERT INTO {identifier(self.database)}.{identifier(self.schema)}.{identifier(self.table_original)} VALUES ({values_str}) """, ) @@ -493,7 +493,7 @@ def run(self, transaction: Transaction) -> None: self.mz_conn.autocommit = True self.execute( cur, - f"""UPDATE {identifier(self.schema)}.{identifier(self.table_original)} + f"""UPDATE {identifier(self.database)}.{identifier(self.schema)}.{identifier(self.table_original)} SET {set_str} WHERE {cond_str} """, @@ -505,7 +505,7 @@ def run(self, transaction: Transaction) -> None: ) self.execute( cur, - f"""INSERT INTO {identifier(self.schema)}.{identifier(self.table_original)} + f"""INSERT INTO {identifier(self.database)}.{identifier(self.schema)}.{identifier(self.table_original)} VALUES ({values_str}) """, ) @@ -519,7 +519,7 @@ def run(self, transaction: Transaction) -> None: self.mz_conn.autocommit = True self.execute( cur, - f"""DELETE FROM {identifier(self.schema)}.{identifier(self.table_original)} 
+ f"""DELETE FROM {identifier(self.database)}.{identifier(self.schema)}.{identifier(self.table_original)} WHERE {cond_str} """, ) diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index 895afa35f1ad0..aad7c581aaccc 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -21,9 +21,10 @@ from materialize.mzcompose.composition import Composition from materialize.mzcompose.services.minio import MINIO_BLOB_URI from materialize.parallel_workload.database import ( - DB_OFFSET, + DB, MAX_CLUSTER_REPLICAS, MAX_CLUSTERS, + MAX_DBS, MAX_KAFKA_SINKS, MAX_KAFKA_SOURCES, MAX_POSTGRES_SOURCES, @@ -438,6 +439,35 @@ def run(self, exe: Executor) -> None: raise +class CreateDatabaseAction(Action): + def run(self, exe: Executor) -> None: + with exe.db.lock: + if len(exe.db.dbs) > MAX_DBS: + return + db_id = exe.db.db_id + exe.db.db_id += 1 + db = DB(exe.db.seed, db_id) + db.create(exe) + exe.db.dbs.append(db) + + +class DropDatabaseAction(Action): + def errors_to_ignore(self, exe: Executor) -> list[str]: + return [ + "cannot be dropped with RESTRICT while it contains schemas", + ] + super().errors_to_ignore(exe) + + def run(self, exe: Executor) -> None: + with exe.db.lock: + if len(exe.db.dbs) <= 1: + return + db_id = self.rng.randrange(len(exe.db.dbs)) + db = exe.db.dbs[db_id] + query = f"DROP DATABASE {db} RESTRICT" + exe.execute(query) + del exe.db.dbs[db_id] + + class CreateSchemaAction(Action): def run(self, exe: Executor) -> None: with exe.db.lock: @@ -445,7 +475,7 @@ def run(self, exe: Executor) -> None: return schema_id = exe.db.schema_id exe.db.schema_id += 1 - schema = Schema(self.rng, schema_id) + schema = Schema(self.rng.choice(exe.db.dbs), schema_id) schema.create(exe) exe.db.schemas.append(schema) @@ -589,7 +619,7 @@ def run(self, exe: Executor) -> None: return role_id = exe.db.role_id exe.db.role_id += 1 - role = Role(exe.db.db_id * DB_OFFSET + role_id) + role = Role(role_id) role.create(exe) exe.db.roles.append(role) @@ -624,7 +654,7 @@ def run(self, exe: Executor) -> None: cluster_id = exe.db.cluster_id exe.db.cluster_id += 1 cluster = Cluster( - exe.db.db_id * DB_OFFSET + cluster_id, + cluster_id, managed=self.rng.choice([True, False]), size=self.rng.choice(["1", "2", "4"]), replication_factor=self.rng.choice([1, 2, 4, 5]), @@ -793,7 +823,7 @@ def run(self, exe: Executor) -> None: for i in range(NUM_ATTEMPTS): try: conn = pg8000.connect( - host=host, port=port, user=user, database=exe.db.name() + host=host, port=port, user=user, database="materialize" ) conn.autocommit = autocommit cur = conn.cursor() @@ -836,10 +866,10 @@ def run(self, exe: Executor) -> None: ) worker = None for i in range(len(self.workers)): - for worker_exe in self.workers[i].exes: - if worker_exe and worker_exe.pg_pid == pid: - worker = f"worker_{i}" - break + worker_exe = self.workers[i].exe + if worker_exe and worker_exe.pg_pid == pid: + worker = f"worker_{i}" + break assert worker exe.execute( f"SELECT pg_cancel_backend({pid})", extra_info=f"Canceling {worker}" @@ -870,22 +900,21 @@ def run(self, exe: Executor) -> None: # TODO: Don't restore immediately, keep copy Database objects class BackupRestoreAction(Action): composition: Composition - databases: list[Database] + db: Database num: int def __init__( - self, rng: random.Random, composition: Composition, databases: list[Database] + self, rng: random.Random, composition: Composition, db: Database ) -> None: super().__init__(rng) 
self.composition = composition - self.databases = databases + self.db = db self.num = 0 def run(self, exe: Executor) -> None: self.num += 1 time.sleep(self.rng.uniform(10, 240)) - for db in self.databases: - db.lock.acquire() + self.db.lock.acquire() try: # Backup @@ -928,8 +957,7 @@ def run(self, exe: Executor) -> None: self.composition.up("materialized") finally: - for db in self.databases: - db.lock.release() + self.db.lock.release() class CreateWebhookSourceAction(Action): @@ -984,8 +1012,7 @@ def run(self, exe: Executor) -> None: schema = self.rng.choice(exe.db.schemas) try: source = KafkaSource( - exe.db.name(), - exe.db.db_id * DB_OFFSET + source_id, + source_id, cluster, schema, exe.db.ports, @@ -1035,8 +1062,7 @@ def run(self, exe: Executor) -> None: cluster = self.rng.choice(potential_clusters) try: source = PostgresSource( - exe.db.name(), - exe.db.db_id * DB_OFFSET + source_id, + source_id, cluster, schema, exe.db.ports, @@ -1091,7 +1117,7 @@ def run(self, exe: Executor) -> None: cluster = self.rng.choice(potential_clusters) schema = self.rng.choice(exe.db.schemas) sink = KafkaSink( - exe.db.db_id * DB_OFFSET + sink_id, + sink_id, cluster, schema, self.rng.choice(exe.db.db_objects_without_views()), @@ -1243,6 +1269,8 @@ def __init__( (GrantPrivilegesAction, 4), (RevokePrivilegesAction, 1), (ReconnectAction, 1), + (CreateDatabaseAction, 1), + (DropDatabaseAction, 1), (CreateSchemaAction, 1), (DropSchemaAction, 1), (RenameSchemaAction, 10), diff --git a/misc/python/materialize/parallel_workload/database.py b/misc/python/materialize/parallel_workload/database.py index 50d18dc62f479..bfb43c0c4b95e 100644 --- a/misc/python/materialize/parallel_workload/database.py +++ b/misc/python/materialize/parallel_workload/database.py @@ -35,17 +35,19 @@ MAX_COLUMNS = 100 MAX_INCLUDE_HEADERS = 5 MAX_ROWS = 1000 -MAX_CLUSTERS = 5 -MAX_CLUSTER_REPLICAS = 3 -MAX_SCHEMAS = 10 -MAX_TABLES = 50 -MAX_VIEWS = 50 -MAX_ROLES = 50 -MAX_WEBHOOK_SOURCES = 10 -MAX_KAFKA_SOURCES = 10 -MAX_POSTGRES_SOURCES = 10 -MAX_KAFKA_SINKS = 10 - +MAX_CLUSTERS = 10 +MAX_CLUSTER_REPLICAS = 4 +MAX_DBS = 10 +MAX_SCHEMAS = 20 +MAX_TABLES = 100 +MAX_VIEWS = 100 +MAX_ROLES = 100 +MAX_WEBHOOK_SOURCES = 20 +MAX_KAFKA_SOURCES = 20 +MAX_POSTGRES_SOURCES = 20 +MAX_KAFKA_SINKS = 20 + +MAX_INITIAL_DBS = 1 MAX_INITIAL_SCHEMAS = 1 MAX_INITIAL_CLUSTERS = 2 MAX_INITIAL_TABLES = 10 @@ -56,8 +58,6 @@ MAX_INITIAL_POSTGRES_SOURCES = 3 MAX_INITIAL_KAFKA_SINKS = 3 -DB_OFFSET = 1_000_000 - NAUGHTY_IDENTIFIERS = False @@ -137,12 +137,35 @@ def create(self) -> str: return result +class DB: + seed: str + db_id: int + + def __init__(self, seed: str, db_id: int): + self.seed = seed + self.db_id = db_id + + def name(self) -> str: + return naughtify(f"db-pw-{self.seed}-{self.db_id}") + + def __str__(self) -> str: + return identifier(self.name()) + + def create(self, exe: Executor) -> None: + exe.execute(f"CREATE DATABASE {self}") + + def drop(self, exe: Executor) -> None: + exe.execute(f"DROP DATABASE IF EXISTS {self}") + + class Schema: schema_id: int rename: int + db: DB - def __init__(self, rng: random.Random, schema_id: int): + def __init__(self, db: DB, schema_id: int): self.schema_id = schema_id + self.db = db self.rename = 0 def name(self) -> str: @@ -151,7 +174,7 @@ def name(self) -> str: return naughtify(f"s-{self.schema_id}") def __str__(self) -> str: - return identifier(self.name()) + return f"{self.db}.{identifier(self.name())}" def create(self, exe: Executor) -> None: query = f"CREATE SCHEMA {self}" @@ -415,7 +438,6 @@ class 
KafkaSource(DBObject): def __init__( self, - database: str, source_id: int, cluster: "Cluster", schema: Schema, @@ -438,7 +460,7 @@ def __init__( KafkaColumn(field.name, field.data_type, False, self) for field in fields ] self.executor = KafkaExecutor( - self.source_id, ports, fields, database, schema.name() + self.source_id, ports, fields, schema.db.name(), schema.name() ) self.generator = rng.choice(list(WORKLOADS))(None).generate(fields) self.lock = threading.Lock() @@ -533,7 +555,6 @@ class PostgresSource(DBObject): def __init__( self, - database: str, source_id: int, cluster: "Cluster", schema: Schema, @@ -556,7 +577,7 @@ def __init__( PostgresColumn(field.name, field.data_type, False, self) for field in fields ] self.executor = PgExecutor( - self.source_id, ports, fields, database, schema.name() + self.source_id, ports, fields, schema.db.name(), schema.name() ) self.generator = rng.choice(list(WORKLOADS))(None).generate(fields) self.lock = threading.Lock() @@ -659,13 +680,14 @@ def create(self, exe: Executor) -> None: exe.execute(query) +# TODO: Can access both databases from same connection! class Database: - db_id: int - seed: str complexity: Complexity scenario: Scenario host: str ports: dict[str, int] + dbs: list[DB] + db_id: int schemas: list[Schema] schema_id: int tables: list[Table] @@ -686,10 +708,10 @@ class Database: kafka_sinks: list[KafkaSink] kafka_sink_id: int lock: threading.Lock + seed: str def __init__( self, - db_id: int, rng: random.Random, seed: str, host: str, @@ -699,16 +721,18 @@ def __init__( naughty_identifiers: bool, ): global NAUGHTY_IDENTIFIERS - self.db_id = db_id - self.seed = seed self.host = host self.ports = ports self.complexity = complexity self.scenario = scenario + self.seed = seed NAUGHTY_IDENTIFIERS = naughty_identifiers + self.dbs = [DB(seed, i) for i in range(rng.randint(1, MAX_INITIAL_DBS))] + self.db_id = len(self.dbs) self.schemas = [ - Schema(rng, i) for i in range(rng.randint(1, MAX_INITIAL_SCHEMAS)) + Schema(rng.choice(self.dbs), i) + for i in range(rng.randint(1, MAX_INITIAL_SCHEMAS)) ] self.schema_id = len(self.schemas) self.tables = [ @@ -727,15 +751,12 @@ def __init__( view = View(rng, i, base_object, base_object2, rng.choice(self.schemas)) self.views.append(view) self.view_id = len(self.views) - self.roles = [ - Role(db_id * DB_OFFSET + i) - for i in range(rng.randint(0, MAX_INITIAL_ROLES)) - ] + self.roles = [Role(i) for i in range(rng.randint(0, MAX_INITIAL_ROLES))] self.role_id = len(self.roles) # At least one storage cluster required for WebhookSources self.clusters = [ Cluster( - self.db_id * DB_OFFSET + i, + i, managed=rng.choice([True, False]), size=rng.choice(["1", "2", "4"]), replication_factor=1, @@ -752,8 +773,7 @@ def __init__( self.webhook_source_id = len(self.webhook_sources) self.kafka_sources = [ KafkaSource( - self.name(), - self.db_id * DB_OFFSET + i, + i, rng.choice(self.clusters), rng.choice(self.schemas), ports, @@ -764,8 +784,7 @@ def __init__( self.kafka_source_id = len(self.kafka_sources) self.postgres_sources = [ PostgresSource( - self.name(), - self.db_id * DB_OFFSET + i, + i, rng.choice(self.clusters), rng.choice(self.schemas), ports, @@ -787,12 +806,6 @@ def __init__( self.kafka_sink_id = len(self.kafka_sinks) self.lock = threading.Lock() - def name(self) -> str: - return naughtify(f"db-pw-{self.seed}-{self.db_id}") - - def __str__(self) -> str: - return identifier(self.name()) - def db_objects( self, ) -> list[WebhookSource | PostgresSource | KafkaSource | View | Table]: @@ -817,28 +830,24 @@ def 
__iter__(self): self.schemas + self.clusters + self.roles + self.db_objects() ).__iter__() - def drop(self, exe: Executor) -> None: - exe.execute(f"DROP DATABASE IF EXISTS {self}") - def create(self, exe: Executor) -> None: - self.drop(exe) - exe.execute(f"CREATE DATABASE {self}") - exe.execute(f"ALTER DATABASE {self} OWNER TO materialize") + for db in self.dbs: + db.drop(exe) + db.create(exe) - def create_relations(self, exe: Executor) -> None: - # Roles and clusters are system wide, not per DB - if self.db_id == 0: - exe.execute("SELECT name FROM mz_clusters WHERE name LIKE 'c%'") - for row in exe.cur.fetchall(): - exe.execute(f"DROP CLUSTER {identifier(row[0])} CASCADE") + exe.execute("SELECT name FROM mz_clusters WHERE name LIKE 'c%'") + for row in exe.cur.fetchall(): + exe.execute(f"DROP CLUSTER {identifier(row[0])} CASCADE") - exe.execute("SELECT name FROM mz_roles WHERE name LIKE 'r%'") - for row in exe.cur.fetchall(): - exe.execute(f"DROP ROLE {identifier(row[0])}") + exe.execute("SELECT name FROM mz_roles WHERE name LIKE 'r%'") + for row in exe.cur.fetchall(): + exe.execute(f"DROP ROLE {identifier(row[0])}") - exe.execute("CREATE CONNECTION kafka_conn FOR KAFKA BROKER 'kafka:9092'") exe.execute( - "CREATE CONNECTION csr_conn FOR CONFLUENT SCHEMA REGISTRY URL 'http://schema-registry:8081'" + "CREATE CONNECTION IF NOT EXISTS kafka_conn FOR KAFKA BROKER 'kafka:9092'" + ) + exe.execute( + "CREATE CONNECTION IF NOT EXISTS csr_conn FOR CONFLUENT SCHEMA REGISTRY URL 'http://schema-registry:8081'" ) print("Created connections") diff --git a/misc/python/materialize/parallel_workload/executor.py b/misc/python/materialize/parallel_workload/executor.py index 1159c0bba159c..db7910b96a565 100644 --- a/misc/python/materialize/parallel_workload/executor.py +++ b/misc/python/materialize/parallel_workload/executor.py @@ -73,7 +73,7 @@ def log(self, msg: str) -> None: thread_name = threading.current_thread().getName() with lock: - print(f"[{thread_name}][{self.db.name()}] {msg}", file=logging) + print(f"[{thread_name}] {msg}", file=logging) logging.flush() def execute( diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index 30f3fb5652eaf..029ae47861fa2 100644 --- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -61,14 +61,13 @@ def run( scenario: Scenario, num_threads: int | None, naughty_identifiers: bool, - num_databases: int, composition: Composition | None, ) -> None: num_threads = num_threads or os.cpu_count() or 10 random.seed(seed) print( - f"--- Running with: --seed={seed} --threads={num_threads} --runtime={runtime} --complexity={complexity.value} --scenario={scenario.value} {'--naughty-identifiers ' if naughty_identifiers else ''}--databases={num_databases} (--host={host})" + f"--- Running with: --seed={seed} --threads={num_threads} --runtime={runtime} --complexity={complexity.value} --scenario={scenario.value} {'--naughty-identifiers ' if naughty_identifiers else ''}(--host={host})" ) initialize_logging() @@ -77,40 +76,29 @@ def run( ).timestamp() rng = random.Random(random.randrange(SEED_RANGE)) - databases = [ - Database(i, rng, seed, host, ports, complexity, scenario, naughty_identifiers) - for i in range(num_databases) - ] + database = Database( + rng, seed, host, ports, complexity, scenario, naughty_identifiers + ) system_conn = pg8000.connect( host=host, port=ports["mz_system"], user="mz_system", 
database="materialize" ) system_conn.autocommit = True with system_conn.cursor() as system_cur: - system_exe = Executor(rng, system_cur, databases[0]) + system_exe = Executor(rng, system_cur, database) system_exe.execute("ALTER SYSTEM SET enable_webhook_sources TO true") system_exe.execute( f"ALTER SYSTEM SET max_schemas_per_database = {MAX_SCHEMAS * 2}" ) # The presence of ALTER TABLE RENAME can cause the total number of tables to exceed MAX_TABLES + system_exe.execute(f"ALTER SYSTEM SET max_tables = {MAX_TABLES * 2}") + system_exe.execute(f"ALTER SYSTEM SET max_materialized_views = {MAX_VIEWS * 2}") system_exe.execute( - f"ALTER SYSTEM SET max_tables = {len(databases) * MAX_TABLES * 2}" - ) - system_exe.execute( - f"ALTER SYSTEM SET max_materialized_views = {len(databases) * MAX_VIEWS * 2}" - ) - system_exe.execute( - f"ALTER SYSTEM SET max_sources = {len(databases) * (MAX_WEBHOOK_SOURCES + MAX_KAFKA_SOURCES + MAX_POSTGRES_SOURCES) * 2}" - ) - system_exe.execute( - f"ALTER SYSTEM SET max_sinks = {len(databases) * MAX_KAFKA_SINKS * 2}" - ) - system_exe.execute( - f"ALTER SYSTEM SET max_roles = {len(databases) * MAX_ROLES * 2}" - ) - system_exe.execute( - f"ALTER SYSTEM SET max_clusters = {len(databases) * MAX_CLUSTERS * 2}" + f"ALTER SYSTEM SET max_sources = {(MAX_WEBHOOK_SOURCES + MAX_KAFKA_SOURCES + MAX_POSTGRES_SOURCES) * 2}" ) + system_exe.execute(f"ALTER SYSTEM SET max_sinks = {MAX_KAFKA_SINKS * 2}") + system_exe.execute(f"ALTER SYSTEM SET max_roles = {MAX_ROLES * 2}") + system_exe.execute(f"ALTER SYSTEM SET max_clusters = {MAX_CLUSTERS * 2}") system_exe.execute( f"ALTER SYSTEM SET max_replicas_per_cluster = {MAX_CLUSTER_REPLICAS * 2}" ) @@ -136,20 +124,17 @@ def run( system_exe.execute( "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON CLUSTERS TO PUBLIC" ) - for database in databases: - database.create(system_exe) - - conn = pg8000.connect( - host=host, - port=ports["materialized"], - user="materialize", - database=database.name(), - ) - conn.autocommit = True - with conn.cursor() as cur: - database.create_relations(Executor(rng, cur, database)) - conn.close() - system_conn.close() + system_conn.close() + conn = pg8000.connect( + host=host, + port=ports["materialized"], + user="materialize", + database="materialize", + ) + conn.autocommit = True + with conn.cursor() as cur: + database.create(Executor(rng, cur, database)) + conn.close() workers = [] threads = [] @@ -194,7 +179,7 @@ def run( thread = threading.Thread( name=thread_name, target=worker.run, - args=(host, ports["materialized"], "materialize", databases), + args=(host, ports["materialized"], "materialize", database), ) thread.start() threads.append(thread) @@ -212,7 +197,7 @@ def run( thread = threading.Thread( name="cancel", target=worker.run, - args=(host, ports["mz_system"], "mz_system", databases), + args=(host, ports["mz_system"], "mz_system", database), ) thread.start() threads.append(thread) @@ -230,7 +215,7 @@ def run( thread = threading.Thread( name="kill", target=worker.run, - args=(host, ports["materialized"], "materialize", databases), + args=(host, ports["materialized"], "materialize", database), ) thread.start() threads.append(thread) @@ -238,7 +223,7 @@ def run( assert composition, "Backup & Restore scenario only works in mzcompose" worker = Worker( worker_rng, - [BackupRestoreAction(worker_rng, composition, databases)], + [BackupRestoreAction(worker_rng, composition, database)], [1], end_time, autocommit=False, @@ -248,7 +233,7 @@ def run( thread = threading.Thread( name="kill", 
target=worker.run, - args=(host, ports["materialized"], "materialize", databases), + args=(host, ports["materialized"], "materialize", database), ) thread.start() threads.append(thread) @@ -286,9 +271,10 @@ def run( conn = pg8000.connect(host=host, port=ports["materialized"], user="materialize") conn.autocommit = True with conn.cursor() as cur: - for database in databases: - print(f"Dropping database {database}") - database.drop(Executor(rng, cur, database)) + exe = Executor(rng, cur, database) + print(f"Dropping database {database}") + for db in database.dbs: + db.drop(exe) conn.close() ignored_errors: defaultdict[str, Counter[type[Action]]] = defaultdict(Counter) @@ -336,12 +322,6 @@ def parse_common_args(parser: argparse.ArgumentParser) -> None: action="store_true", help="Whether to use naughty strings as identifiers, makes the queries unreadable", ) - parser.add_argument( - "--databases", - default=2, - type=int, - help="Number of databases to create and run against, 2 by default", - ) def main() -> int: @@ -390,7 +370,6 @@ def main() -> int: Scenario(args.scenario), args.threads, args.naughty_identifiers, - args.databases, composition=None, # only works in mzcompose ) return 0 diff --git a/misc/python/materialize/parallel_workload/worker.py b/misc/python/materialize/parallel_workload/worker.py index 92eecd72fc8cc..83b7ac3e325f0 100644 --- a/misc/python/materialize/parallel_workload/worker.py +++ b/misc/python/materialize/parallel_workload/worker.py @@ -28,7 +28,7 @@ class Worker: num_queries: int autocommit: bool system: bool - exes: list[Executor] + exe: Executor | None ignored_errors: defaultdict[str, Counter[type[Action]]] def __init__( @@ -48,30 +48,26 @@ def __init__( self.autocommit = autocommit self.system = system self.ignored_errors = defaultdict(Counter) - self.exes = [] + self.exe = None - def run(self, host: str, port: int, user: str, databases: list[Database]) -> None: - self.conns = [ - pg8000.connect(host=host, port=port, user=user, database=database.name()) - for database in databases - ] - for database, conn in zip(databases, self.conns): - conn.autocommit = self.autocommit - cur = conn.cursor() - exe = Executor(self.rng, cur, database) - exe.set_isolation("SERIALIZABLE") - cur.execute("SELECT pg_backend_pid()") - exe.pg_pid = cur.fetchall()[0][0] - self.exes.append(exe) + def run(self, host: str, port: int, user: str, database: Database) -> None: + self.conn = pg8000.connect( + host=host, port=port, user=user, database="materialize" + ) + self.conn.autocommit = self.autocommit + cur = self.conn.cursor() + self.exe = Executor(self.rng, cur, database) + self.exe.set_isolation("SERIALIZABLE") + cur.execute("SELECT pg_backend_pid()") + self.exe.pg_pid = cur.fetchall()[0][0] while time.time() < self.end_time: - exe = self.rng.choice(self.exes) action = self.rng.choices(self.actions, self.weights)[0] self.num_queries += 1 try: - if exe.rollback_next: + if self.exe.rollback_next: try: - exe.rollback() + self.exe.rollback() except QueryError as e: if ( "Please disconnect and re-connect" in e.msg @@ -79,16 +75,16 @@ def run(self, host: str, port: int, user: str, databases: list[Database]) -> Non or "Can't create a connection to host" in e.msg or "Connection refused" in e.msg ): - exe.reconnect_next = True - exe.rollback_next = False + self.exe.reconnect_next = True + self.exe.rollback_next = False continue - exe.rollback_next = False - if exe.reconnect_next: - ReconnectAction(self.rng, random_role=False).run(exe) - exe.reconnect_next = False - action.run(exe) + 
self.exe.rollback_next = False + if self.exe.reconnect_next: + ReconnectAction(self.rng, random_role=False).run(self.exe) + self.exe.reconnect_next = False + action.run(self.exe) except QueryError as e: - for error in action.errors_to_ignore(exe): + for error in action.errors_to_ignore(self.exe): if error in e.msg: self.ignored_errors[error][type(action)] += 1 if ( @@ -97,13 +93,11 @@ def run(self, host: str, port: int, user: str, databases: list[Database]) -> Non or "Can't create a connection to host" in e.msg or "Connection refused" in e.msg ): - exe.reconnect_next = True + self.exe.reconnect_next = True else: - exe.rollback_next = True + self.exe.rollback_next = True break else: thread_name = threading.current_thread().getName() - print( - f"[{thread_name}][{exe.db.name()}] Query failed: {e.query} {e.msg}" - ) + print(f"[{thread_name}] Query failed: {e.query} {e.msg}") raise diff --git a/test/parallel-workload/mzcompose.py b/test/parallel-workload/mzcompose.py index c4ee9f7b5974f..50dcadb0dd6cd 100644 --- a/test/parallel-workload/mzcompose.py +++ b/test/parallel-workload/mzcompose.py @@ -90,7 +90,6 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: Scenario(args.scenario), args.threads, args.naughty_identifiers, - args.databases, c, ) # TODO: Only ignore errors that will be handled by parallel-workload, not others From 94f10607be5aa992e85e7a0d426b6e6c8e66f973 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 27 Oct 2023 10:51:45 +0000 Subject: [PATCH 13/17] parallel-workload: Handle stuck queries --- .../materialize/data_ingest/executor.py | 2 +- .../materialize/parallel_workload/action.py | 92 ++++++++++--------- .../materialize/parallel_workload/executor.py | 3 + .../parallel_workload/parallel_workload.py | 15 ++- 4 files changed, 65 insertions(+), 47 deletions(-) diff --git a/misc/python/materialize/data_ingest/executor.py b/misc/python/materialize/data_ingest/executor.py index 742619fae4543..33e269fc6962c 100644 --- a/misc/python/materialize/data_ingest/executor.py +++ b/misc/python/materialize/data_ingest/executor.py @@ -78,7 +78,7 @@ def execute(self, cur: pg8000.Cursor, query: str) -> None: with self.mz_conn.cursor() as cur: self.execute(cur, query) except Exception as e: - print(f"Query failed: {query}") + print(f"Query failed: {query} {e}") raise QueryError(str(e), query) def execute_with_retry_on_error( diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index aad7c581aaccc..cdab0052c5a5f 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -516,7 +516,9 @@ def run(self, exe: Executor) -> None: old_name = str(schema) schema.rename += 1 try: - exe.execute(f"ALTER SCHEMA {old_name} RENAME TO {schema}") + exe.execute( + f"ALTER SCHEMA {old_name} RENAME TO {identifier(schema.name())}" + ) except: schema.rename -= 1 raise @@ -524,24 +526,26 @@ def run(self, exe: Executor) -> None: class SwapSchemaAction(Action): def run(self, exe: Executor) -> None: - if self.db.scenario != Scenario.Rename: + if exe.db.scenario != Scenario.Rename: return - with self.db.lock: - if len(self.db.schemas) < 2: + with exe.db.lock: + db = self.rng.choice(exe.db.dbs) + schemas = [schema for schema in exe.db.schemas if schema.db == db] + if len(schemas) < 2: return - (i1, schema1), (i2, schema2) = self.rng.sample( - list(enumerate(self.db.schemas)), 2 - ) - self.db.schemas[i1], self.db.schemas[i2] = ( - self.db.schemas[i2], - 
self.db.schemas[i1], + (i1, schema1), (i2, schema2) = self.rng.sample(list(enumerate(schemas)), 2) + exe.db.schemas[i1], exe.db.schemas[i2] = ( + exe.db.schemas[i2], + exe.db.schemas[i1], ) try: - exe.execute(f"ALTER SCHEMA {schema1} SWAP WITH {schema2}") + exe.execute( + f"ALTER SCHEMA {schema1} SWAP WITH {identifier(schema2.name())}" + ) except: - self.db.schemas[i1], self.db.schemas[i2] = ( - self.db.schemas[i2], - self.db.schemas[i1], + exe.db.schemas[i1], exe.db.schemas[i2] = ( + exe.db.schemas[i2], + exe.db.schemas[i1], ) raise @@ -862,7 +866,7 @@ def __init__( def run(self, exe: Executor) -> None: pid = self.rng.choice( - [exe.pg_pid for worker in self.workers for exe in worker.exes if exe and exe.pg_pid != -1] # type: ignore + [worker.exe.pg_pid for worker in self.workers if worker.exe and worker.exe.pg_pid != -1] # type: ignore ) worker = None for i in range(len(self.workers)): @@ -1007,22 +1011,22 @@ def run(self, exe: Executor) -> None: return source_id = exe.db.kafka_source_id exe.db.kafka_source_id += 1 - potential_clusters = [c for c in exe.db.clusters if len(c.replicas) == 1] - cluster = self.rng.choice(potential_clusters) - schema = self.rng.choice(exe.db.schemas) - try: - source = KafkaSource( - source_id, - cluster, - schema, - exe.db.ports, - self.rng, - ) - source.create(exe) - exe.db.kafka_sources.append(source) - except: - if exe.db.scenario != Scenario.Kill: - raise + potential_clusters = [c for c in exe.db.clusters if len(c.replicas) == 1] + cluster = self.rng.choice(potential_clusters) + schema = self.rng.choice(exe.db.schemas) + try: + source = KafkaSource( + source_id, + cluster, + schema, + exe.db.ports, + self.rng, + ) + source.create(exe) + exe.db.kafka_sources.append(source) + except: + if exe.db.scenario != Scenario.Kill: + raise class DropKafkaSourceAction(Action): @@ -1060,19 +1064,19 @@ def run(self, exe: Executor) -> None: potential_clusters = [c for c in exe.db.clusters if len(c.replicas) == 1] schema = self.rng.choice(exe.db.schemas) cluster = self.rng.choice(potential_clusters) - try: - source = PostgresSource( - source_id, - cluster, - schema, - exe.db.ports, - self.rng, - ) - source.create(exe) - exe.db.postgres_sources.append(source) - except: - if exe.db.scenario != Scenario.Kill: - raise + try: + source = PostgresSource( + source_id, + cluster, + schema, + exe.db.ports, + self.rng, + ) + source.create(exe) + exe.db.postgres_sources.append(source) + except: + if exe.db.scenario != Scenario.Kill: + raise class DropPostgresSourceAction(Action): diff --git a/misc/python/materialize/parallel_workload/executor.py b/misc/python/materialize/parallel_workload/executor.py index db7910b96a565..b00b13aa2e1c3 100644 --- a/misc/python/materialize/parallel_workload/executor.py +++ b/misc/python/materialize/parallel_workload/executor.py @@ -37,6 +37,7 @@ class Executor: db: "Database" reconnect_next: bool rollback_next: bool + last_log: str def __init__(self, rng: random.Random, cur: pg8000.Cursor, db: "Database"): self.rng = rng @@ -46,6 +47,7 @@ def __init__(self, rng: random.Random, cur: pg8000.Cursor, db: "Database"): self.insert_table = None self.reconnect_next = True self.rollback_next = True + self.last_log = "" def set_isolation(self, level: str) -> None: self.execute(f"SET TRANSACTION_ISOLATION TO '{level}'") @@ -71,6 +73,7 @@ def log(self, msg: str) -> None: return thread_name = threading.current_thread().getName() + self.last_log = msg with lock: print(f"[{thread_name}] {msg}", file=logging) diff --git 
a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index 029ae47861fa2..5895870d3b8e1 100644 --- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -265,8 +265,19 @@ def run( for worker in workers: worker.end_time = time.time() - for thread in threads: - thread.join() + stopping_time = ( + datetime.datetime.now() + datetime.timedelta(seconds=300) + ).timestamp() + while time.time() < stopping_time: + for worker, thread in zip(workers, threads): + thread.join(timeout=1) + if thread.is_alive(): + print(f"{thread.name} still running: {worker.exe.last_log}") + if all([not thread.is_alive() for thread in threads]): + break + else: + print("Threads have not stopped within 5 minutes, exiting hard") + sys.exit(1) conn = pg8000.connect(host=host, port=ports["materialized"], user="materialize") conn.autocommit = True From 4509680454ecf82a836bc91d1357e567579f89b6 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 27 Oct 2023 10:53:57 +0000 Subject: [PATCH 14/17] Address reviewer comment --- .../parallel_workload/parallel_workload.py | 33 +++++++------------ 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index 5895870d3b8e1..86203548de8dc 100644 --- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -103,27 +103,18 @@ def run( f"ALTER SYSTEM SET max_replicas_per_cluster = {MAX_CLUSTER_REPLICAS * 2}" ) # Most queries should not fail because of privileges - system_exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON TABLES TO PUBLIC" - ) - system_exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON TYPES TO PUBLIC" - ) - system_exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON SECRETS TO PUBLIC" - ) - system_exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON CONNECTIONS TO PUBLIC" - ) - system_exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON DATABASES TO PUBLIC" - ) - system_exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON SCHEMAS TO PUBLIC" - ) - system_exe.execute( - "ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON CLUSTERS TO PUBLIC" - ) + for object_type in [ + "TABLES", + "TYPES", + "SECRETS", + "CONNECTIONS", + "DATABASES", + "SCHEMAS", + "CLUSTERS", + ]: + system_exe.execute( + f"ALTER DEFAULT PRIVILEGES FOR ALL ROLES GRANT ALL PRIVILEGES ON {object_type} TO PUBLIC" + ) system_conn.close() conn = pg8000.connect( host=host, From 28847a20d087c5d09aa2118c8e581b6e2925ad7c Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 27 Oct 2023 14:56:48 +0000 Subject: [PATCH 15/17] Better handling of existing errors --- .../materialize/parallel_workload/action.py | 50 ++++++++++--------- .../parallel_workload/parallel_workload.py | 9 ++-- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/misc/python/materialize/parallel_workload/action.py b/misc/python/materialize/parallel_workload/action.py index cdab0052c5a5f..4043488c22b71 100644 --- a/misc/python/materialize/parallel_workload/action.py +++ b/misc/python/materialize/parallel_workload/action.py @@ -88,7 +88,7 @@ def errors_to_ignore(self, exe: Executor) -> 
list[str]: "canceling statement due to user request", ] ) - if exe.db.scenario == Scenario.Kill: + if exe.db.scenario in (Scenario.Kill, Scenario.BackupRestore): result.extend( [ "network error", @@ -339,9 +339,9 @@ def run(self, exe: Executor) -> None: exe.execute(query) except QueryError as e: # expected, see #20465 - if ( - exe.db.scenario != Scenario.Kill - or "unknown catalog item" not in e.msg + if exe.db.scenario != Scenario.Kill or ( + "unknown catalog item" not in e.msg + and "unknown schema" not in e.msg ): raise e exe.db.indexes.remove(index_name) @@ -374,9 +374,9 @@ def run(self, exe: Executor) -> None: exe.execute(query) except QueryError as e: # expected, see #20465 - if ( - exe.db.scenario != Scenario.Kill - or "unknown catalog item" not in e.msg + if exe.db.scenario != Scenario.Kill or ( + "unknown catalog item" not in e.msg + and "unknown schema" not in e.msg ): raise e exe.db.tables.remove(table) @@ -573,8 +573,10 @@ def run(self, exe: Executor) -> None: exe.db.view_id += 1 # Don't use views for now since LIMIT 1 and statement_timeout are # not effective yet at preventing long-running queries and OoMs. - base_object = self.rng.choice(exe.db.db_objects()) - base_object2: DBObject | None = self.rng.choice(exe.db.db_objects()) + base_object = self.rng.choice(exe.db.db_objects_without_views()) + base_object2: DBObject | None = self.rng.choice( + exe.db.db_objects_without_views() + ) if self.rng.choice([True, False]) or base_object2 == base_object: base_object2 = None view = View( @@ -608,9 +610,9 @@ def run(self, exe: Executor) -> None: exe.execute(query) except QueryError as e: # expected, see #20465 - if ( - exe.db.scenario != Scenario.Kill - or "unknown catalog item" not in e.msg + if exe.db.scenario != Scenario.Kill or ( + "unknown catalog item" not in e.msg + and "unknown schema" not in e.msg ): raise e del exe.db.views[view_id] @@ -996,9 +998,9 @@ def run(self, exe: Executor) -> None: exe.execute(query) except QueryError as e: # expected, see #20465 - if ( - exe.db.scenario != Scenario.Kill - or "unknown catalog item" not in e.msg + if exe.db.scenario != Scenario.Kill or ( + "unknown catalog item" not in e.msg + and "unknown schema" not in e.msg ): raise e del exe.db.webhook_sources[source_id] @@ -1046,9 +1048,9 @@ def run(self, exe: Executor) -> None: exe.execute(query) except QueryError as e: # expected, see #20465 - if ( - exe.db.scenario != Scenario.Kill - or "unknown catalog item" not in e.msg + if exe.db.scenario != Scenario.Kill or ( + "unknown catalog item" not in e.msg + and "unknown schema" not in e.msg ): raise e del exe.db.kafka_sources[source_id] @@ -1096,9 +1098,9 @@ def run(self, exe: Executor) -> None: exe.execute(query) except QueryError as e: # expected, see #20465 - if ( - exe.db.scenario != Scenario.Kill - or "unknown catalog item" not in e.msg + if exe.db.scenario != Scenario.Kill or ( + "unknown catalog item" not in e.msg + and "unknown schema" not in e.msg ): raise e del exe.db.postgres_sources[source_id] @@ -1148,9 +1150,9 @@ def run(self, exe: Executor) -> None: exe.execute(query) except QueryError as e: # expected, see #20465 - if ( - exe.db.scenario != Scenario.Kill - or "unknown catalog item" not in e.msg + if exe.db.scenario != Scenario.Kill or ( + "unknown catalog item" not in e.msg + and "unknown schema" not in e.msg ): raise e del exe.db.kafka_sinks[sink_id] diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index 86203548de8dc..9ad024b3a37d0 100644 
--- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -260,15 +260,16 @@ def run( datetime.datetime.now() + datetime.timedelta(seconds=300) ).timestamp() while time.time() < stopping_time: - for worker, thread in zip(workers, threads): + for thread in threads: thread.join(timeout=1) - if thread.is_alive(): - print(f"{thread.name} still running: {worker.exe.last_log}") if all([not thread.is_alive() for thread in threads]): break else: + for worker, thread in zip(workers, threads): + if thread.is_alive(): + print(f"{thread.name} still running: {worker.exe.last_log}") print("Threads have not stopped within 5 minutes, exiting hard") - sys.exit(1) + os._exit(1) conn = pg8000.connect(host=host, port=ports["materialized"], user="materialize") conn.autocommit = True From 7ae02eb1670234fa45b2742385fe11e84c4b0341 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 27 Oct 2023 14:55:34 +0000 Subject: [PATCH 16/17] Make parallel-workload reproducible hash and set/dict order are not deterministic between runs --- .../python/materialize/data_ingest/data_type.py | 17 ++++++++++------- .../materialize/parallel_workload/database.py | 4 +++- .../parallel_workload/parallel_workload.py | 5 ++++- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/misc/python/materialize/data_ingest/data_type.py b/misc/python/materialize/data_ingest/data_type.py index 69e0963fd2e97..ece463df8d8e9 100644 --- a/misc/python/materialize/data_ingest/data_type.py +++ b/misc/python/materialize/data_ingest/data_type.py @@ -60,7 +60,7 @@ def random_value( record_size: RecordSize = RecordSize.LARGE, in_query: bool = False, ) -> Any: - return random.choice((True, False)) + return rng.choice((True, False)) @staticmethod def name(backend: Backend = Backend.POSTGRES) -> str: @@ -243,13 +243,13 @@ def random_value( # chars = string.printable chars = string.ascii_letters + string.digits if record_size == RecordSize.TINY: - result = random.choice(("foo", "bar", "baz")) + result = rng.choice(("foo", "bar", "baz")) elif record_size == RecordSize.SMALL: - result = "".join(random.choice(chars) for _ in range(3)) + result = "".join(rng.choice(chars) for _ in range(3)) elif record_size == RecordSize.MEDIUM: - result = "".join(random.choice(chars) for _ in range(10)) + result = "".join(rng.choice(chars) for _ in range(10)) elif record_size == RecordSize.LARGE: - result = "".join(random.choice(chars) for _ in range(100)) + result = "".join(rng.choice(chars) for _ in range(100)) else: raise ValueError(f"Unexpected record size {record_size}") @@ -357,10 +357,13 @@ def numeric_value(num: int, in_query: bool = False) -> Any: return f"'{values_str}'::map[text=>text]" if in_query else values_str -DATA_TYPES = list(all_subclasses(DataType)) +# Sort to keep determinism for reproducible runs with specific seed +DATA_TYPES = sorted(list(all_subclasses(DataType)), key=repr) # fastavro._schema_common.UnknownType: record # bytea requires Python bytes type instead of str -DATA_TYPES_FOR_AVRO = list(set(DATA_TYPES) - {TextTextMap, Jsonb, Bytea, Boolean}) +DATA_TYPES_FOR_AVRO = sorted( + list(set(DATA_TYPES) - {TextTextMap, Jsonb, Bytea, Boolean}), key=repr +) NUMBER_TYPES = [SmallInt, Int, Long, Float, Double] diff --git a/misc/python/materialize/parallel_workload/database.py b/misc/python/materialize/parallel_workload/database.py index bfb43c0c4b95e..edfeb49fd4bef 100644 --- a/misc/python/materialize/parallel_workload/database.py +++ 
b/misc/python/materialize/parallel_workload/database.py @@ -71,7 +71,9 @@ def naughtify(name: str) -> str: strings = naughty_strings() # This rng is just to get a more interesting integer for the name - index = abs(hash(name)) % len(strings) + index = sum([10**i * c for i, c in enumerate(name.encode("utf-8"))]) % len( + strings + ) # Keep them short so we can combine later with other identifiers, 255 char limit return f"{name}_{strings[index].encode('utf-8')[:16].decode('utf-8', 'ignore')}" diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index 9ad024b3a37d0..2d2efe766b3f0 100644 --- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -129,7 +129,6 @@ def run( workers = [] threads = [] - worker_rng = random.Random(rng.randrange(SEED_RANGE)) for i in range(num_threads): weights: list[float] if complexity == Complexity.DDL: @@ -140,6 +139,7 @@ def run( weights = [60, 30, 0, 0, 0] else: raise ValueError(f"Unknown complexity {complexity}") + worker_rng = random.Random(rng.randrange(SEED_RANGE)) action_list = worker_rng.choices( [ read_action_list, @@ -176,6 +176,7 @@ def run( threads.append(thread) if scenario == Scenario.Cancel: + worker_rng = random.Random(rng.randrange(SEED_RANGE)) worker = Worker( worker_rng, [CancelAction(worker_rng, workers)], @@ -193,6 +194,7 @@ def run( thread.start() threads.append(thread) elif scenario == Scenario.Kill: + worker_rng = random.Random(rng.randrange(SEED_RANGE)) assert composition, "Kill scenario only works in mzcompose" worker = Worker( worker_rng, @@ -211,6 +213,7 @@ def run( thread.start() threads.append(thread) elif scenario == Scenario.BackupRestore: + worker_rng = random.Random(rng.randrange(SEED_RANGE)) assert composition, "Backup & Restore scenario only works in mzcompose" worker = Worker( worker_rng, From a1ba25382b279afa164237f7d964a9ad21377162 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Fri, 27 Oct 2023 22:58:15 +0000 Subject: [PATCH 17/17] Workaround for 22717 --- .../materialize/parallel_workload/parallel_workload.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/misc/python/materialize/parallel_workload/parallel_workload.py b/misc/python/materialize/parallel_workload/parallel_workload.py index 2d2efe766b3f0..ff31d19ed6c7f 100644 --- a/misc/python/materialize/parallel_workload/parallel_workload.py +++ b/misc/python/materialize/parallel_workload/parallel_workload.py @@ -67,7 +67,7 @@ def run( random.seed(seed) print( - f"--- Running with: --seed={seed} --threads={num_threads} --runtime={runtime} --complexity={complexity.value} --scenario={scenario.value} {'--naughty-identifiers ' if naughty_identifiers else ''}(--host={host})" + f"+++ Running with: --seed={seed} --threads={num_threads} --runtime={runtime} --complexity={complexity.value} --scenario={scenario.value} {'--naughty-identifiers ' if naughty_identifiers else ''}(--host={host})" ) initialize_logging() @@ -272,7 +272,8 @@ def run( if thread.is_alive(): print(f"{thread.name} still running: {worker.exe.last_log}") print("Threads have not stopped within 5 minutes, exiting hard") - os._exit(1) + # TODO(def-): Switch to failing exit code when https://github.com/MaterializeInc/materialize/issues/22717 is fixed + os._exit(0) conn = pg8000.connect(host=host, port=ports["materialized"], user="materialize") conn.autocommit = True
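
A minimal sketch, separate from the patch series above, of the non-determinism that PATCH 16 ("Make parallel-workload reproducible") works around: Python salts str hashing per process (unless PYTHONHASHSEED is pinned), so `abs(hash(name)) % len(strings)` in naughtify() could select a different naughty string on every run, and set iteration order (e.g. the subclass set behind DATA_TYPES) shifts with it. The positional byte sum introduced by the patch, plus sorting the type list by repr, is stable across runs. The modulus 515 below is only a stand-in for the real `len(naughty_strings())`, which is not shown in this patch.

    def deterministic_index(name: str, modulus: int) -> int:
        # Same scheme as the patched naughtify(): weight each UTF-8 byte by 10**position,
        # which depends only on the name itself, never on interpreter hash salting.
        return sum(10**i * c for i, c in enumerate(name.encode("utf-8"))) % modulus

    def salted_index(name: str, modulus: int) -> int:
        # What the old code did; varies between interpreter invocations unless
        # PYTHONHASHSEED is fixed, so reruns with the same --seed diverge.
        return abs(hash(name)) % modulus

    if __name__ == "__main__":
        for name in ("t-0", "s-1", "db-pw-seed-2"):
            # deterministic_index prints the same value on every run;
            # salted_index generally does not.
            print(name, deterministic_index(name, 515), salted_index(name, 515))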