let MCV & histogram use table sample
Signed-off-by: Murphy <[email protected]>
murphyatwork committed Dec 2, 2024
1 parent 4e448af commit 5bdb1f8
Showing 2 changed files with 48 additions and 14 deletions.
6 changes: 6 additions & 0 deletions fe/fe-core/src/main/java/com/starrocks/common/Config.java
@@ -2126,6 +2126,9 @@ public class Config extends ConfigBase {
@ConfField(mutable = true)
public static long histogram_mcv_size = 100;

+ @ConfField(mutable = true, comment = "Use sampling to build column MCV. 0 means disable sampling")
+ public static double histogram_mcv_sample_ratio = 0.1;

/**
* default sample ratio of histogram statistics
*/
@@ -2138,6 +2141,9 @@ public class Config extends ConfigBase {
@ConfField(mutable = true)
public static long histogram_max_sample_row_count = 10000000;

+ @ConfField(mutable = true, comment = "Use table sample instead of row-level bernoulli sample")
+ public static boolean histogram_enable_table_sample = true;

@ConfField(mutable = true)
public static long connector_table_query_trigger_analyze_small_table_rows = 10000000; // 10M
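Both new knobs are declared with @ConfField(mutable = true), so they should be adjustable on a running FE; a minimal sketch, assuming the usual ADMIN SET FRONTEND CONFIG syntax and purely illustrative values:

-- Illustrative only: shrink the MCV sample and fall back to row-level sampling.
ADMIN SET FRONTEND CONFIG ("histogram_mcv_sample_ratio" = "0.05");
ADMIN SET FRONTEND CONFIG ("histogram_enable_table_sample" = "false");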

56 changes: 42 additions & 14 deletions fe/fe-core/src/main/java/com/starrocks/statistic/HistogramStatisticsCollectJob.java
@@ -38,21 +38,25 @@ public class HistogramStatisticsCollectJob extends StatisticsCollectJob {
" histogram(`column_key`, cast($bucketNum as int), cast($sampleRatio as double)), " +
" $mcv," +
" NOW()" +
- " FROM (SELECT $columnName as column_key FROM `$dbName`.`$tableName` where rand() <= $sampleRatio" +
- " and $columnName is not null $MCVExclude" +
- " ORDER BY $columnName LIMIT $totalRows) t";
+ " FROM (" +
+ " SELECT $columnName as column_key " +
+ " FROM `$dbName`.`$tableName` $sampleClause " +
+ " WHERE $randFilter and $columnName is not null $MCVExclude" +
+ " ORDER BY $columnName LIMIT $totalRows) t";

private static final String COLLECT_MCV_STATISTIC_TEMPLATE =
"select cast(version as INT), cast(db_id as BIGINT), cast(table_id as BIGINT), " +
"cast(column_key as varchar), cast(column_value as varchar) from (" +
- "select " + StatsConstants.STATISTIC_HISTOGRAM_VERSION + " as version, " +
- "$dbId as db_id, " +
- "$tableId as table_id, " +
- "$columnName as column_key, " +
- "count($columnName) as column_value " +
- "from `$dbName`.`$tableName` where $columnName is not null " +
- "group by $columnName " +
- "order by count($columnName) desc limit $topN ) t";
+ "SELECT " +
+ StatsConstants.STATISTIC_HISTOGRAM_VERSION + " as version, " +
+ " $dbId as db_id, " +
+ " $tableId as table_id, " +
+ " $columnName as column_key, " +
+ " count($columnName) as column_value " +
+ "FROM `$dbName`.`$tableName` $sampleClause " +
+ "WHERE $columnName is not null " +
+ "GROUP BY $columnName " +
+ "ORDER BY count($columnName) desc limit $topN ) t";

public HistogramStatisticsCollectJob(Database db, Table table, List<String> columnNames, List<Type> columnTypes,
StatsConstants.ScheduleType scheduleType, Map<String, String> properties) {
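To make the two sampling paths of the rewritten templates concrete, here is a rough rendering of the new inner subquery, assuming a hypothetical table `db`.`t`, column `c`, a sample ratio of 0.1, the default histogram_max_sample_row_count, and ignoring the $MCVExclude predicate (a sketch, not literal output of the code):

-- Table-sample path (histogram_enable_table_sample = true):
SELECT `c` as column_key
FROM `db`.`t` SAMPLE('percent'='10')
WHERE TRUE and `c` is not null
ORDER BY `c` LIMIT 10000000

-- Row-level Bernoulli path (histogram_enable_table_sample = false):
SELECT `c` as column_key
FROM `db`.`t`
WHERE rand() < 0.100000 and `c` is not null
ORDER BY `c` LIMIT 10000000

The point of the switch is that the SAMPLE clause can skip data at scan time, while the rand() filter still reads every row and discards most of it afterwards.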
@@ -75,13 +79,19 @@ public void collect(ConnectContext context, AnalyzeStatus analyzeStatus) throws
for (int i = 0; i < columnNames.size(); i++) {
String columnName = columnNames.get(i);
Type columnType = columnTypes.get(i);
- String sql = buildCollectMCV(db, table, mcvSize, columnName);
+ String sql = buildCollectMCV(db, table, mcvSize, columnName, sampleRatio);
StatisticExecutor statisticExecutor = new StatisticExecutor();
List<TStatisticData> mcv = statisticExecutor.queryMCV(context, sql);

Map<String, String> mostCommonValues = new HashMap<>();
for (TStatisticData tStatisticData : mcv) {
- mostCommonValues.put(tStatisticData.columnName, tStatisticData.histogram);
+ if (sampleRatio > 0.0 && sampleRatio < 1.0) {
+ long count = Long.parseLong(tStatisticData.histogram);
+ count = (long) (1.0 * count / sampleRatio);
+ mostCommonValues.put(tStatisticData.columnName, String.valueOf(count));
+ } else {
+ mostCommonValues.put(tStatisticData.columnName, tStatisticData.histogram);
+ }
}

sql = buildCollectHistogram(db, table, sampleRatio, bucketNum, mostCommonValues, columnName, columnType);
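Because the MCV counts above now come from a sample, each observed count is extrapolated back to a full-table estimate by dividing by the sample ratio: with histogram_mcv_sample_ratio = 0.1, a value counted 1,234 times in the sample is stored as 12,340. The same arithmetic as a throwaway query (illustrative only):

-- 1,234 occurrences in a 10% sample extrapolate to roughly 12,340 rows.
SELECT CAST(1234 / 0.1 AS BIGINT) AS estimated_count;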
@@ -93,7 +103,7 @@ public void collect(ConnectContext context, AnalyzeStatus analyzeStatus) throws
}
}

- private String buildCollectMCV(Database database, Table table, Long topN, String columnName) {
+ private String buildCollectMCV(Database database, Table table, Long topN, String columnName, double sampleRatio) {
VelocityContext context = new VelocityContext();
context.put("tableId", table.getId());
context.put("columnName", StatisticUtils.quoting(table, columnName));
@@ -103,6 +113,13 @@ private String buildCollectMCV(Database database, Table table, Long topN, String
context.put("tableName", table.getName());
context.put("topN", topN);

+ if (sampleRatio > 0.0 && sampleRatio < 1.0) {
+ String sample = String.format("SAMPLE('percent'='%d')", (int) (sampleRatio * 100));
+ context.put("sampleClause", sample);
+ } else {
+ context.put("sampleClause", "");
+ }

return build(context, COLLECT_MCV_STATISTIC_TEMPLATE);
}

@@ -124,6 +141,17 @@ private String buildCollectHistogram(Database database, Table table, double samp
context.put("sampleRatio", sampleRatio);
context.put("totalRows", Config.histogram_max_sample_row_count);

+ // TODO: use it by default and remove this switch
+ if (Config.histogram_enable_table_sample) {
+ String sampleClause = String.format("SAMPLE('percent'='%d')", (int) (sampleRatio * 100));
+ context.put("sampleClause", sampleClause);
+ context.put("randFilter", "TRUE");
+ } else {
+ String randFilter = String.format(" rand() < %f", sampleRatio);
+ context.put("randFilter", randFilter);
+ context.put("sampleClause", "");
+ }

List<String> mcvList = new ArrayList<>();
for (Map.Entry<String, String> entry : mostCommonValues.entrySet()) {
mcvList.add("[\"" + entry.getKey() + "\",\"" + entry.getValue() + "\"]");
