let MCV & histogram use table sample
Signed-off-by: Murphy <[email protected]>
murphyatwork committed Dec 2, 2024
1 parent 4e448af commit 5bdb1f8
Showing 2 changed files with 48 additions and 14 deletions.
6 changes: 6 additions & 0 deletions fe/fe-core/src/main/java/com/starrocks/common/Config.java
@@ -2126,6 +2126,9 @@ public class Config extends ConfigBase {
@ConfField(mutable = true)
public static long histogram_mcv_size = 100;

+ @ConfField(mutable = true, comment = "Use sampling to build column MCV. 0 means disable sampling")
+ public static double histogram_mcv_sample_ratio = 0.1;

/**
* default sample ratio of histogram statistics
*/
@@ -2138,6 +2141,9 @@ public class Config extends ConfigBase {
@ConfField(mutable = true)
public static long histogram_max_sample_row_count = 10000000;

+ @ConfField(mutable = true, comment = "Use table sample instead of row-level bernoulli sample")
+ public static boolean histogram_enable_table_sample = true;

@ConfField(mutable = true)
public static long connector_table_query_trigger_analyze_small_table_rows = 10000000; // 10M
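Both new knobs are declared with @ConfField(mutable = true), so they should be adjustable on a running FE; a minimal sketch, assuming the usual ADMIN SET FRONTEND CONFIG syntax and purely illustrative values:

-- Illustrative only: shrink the MCV sample and fall back to row-level sampling.
ADMIN SET FRONTEND CONFIG ("histogram_mcv_sample_ratio" = "0.05");
ADMIN SET FRONTEND CONFIG ("histogram_enable_table_sample" = "false");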

56 changes: 42 additions & 14 deletions fe/fe-core/src/main/java/com/starrocks/statistic/HistogramStatisticsCollectJob.java
@@ -38,21 +38,25 @@ public class HistogramStatisticsCollectJob extends StatisticsCollectJob {
" histogram(`column_key`, cast($bucketNum as int), cast($sampleRatio as double)), " +
" $mcv," +
" NOW()" +
- " FROM (SELECT $columnName as column_key FROM `$dbName`.`$tableName` where rand() <= $sampleRatio" +
- " and $columnName is not null $MCVExclude" +
- " ORDER BY $columnName LIMIT $totalRows) t";
+ " FROM (" +
+ " SELECT $columnName as column_key " +
+ " FROM `$dbName`.`$tableName` $sampleClause " +
+ " WHERE $randFilter and $columnName is not null $MCVExclude" +
+ " ORDER BY $columnName LIMIT $totalRows) t";

private static final String COLLECT_MCV_STATISTIC_TEMPLATE =
"select cast(version as INT), cast(db_id as BIGINT), cast(table_id as BIGINT), " +
"cast(column_key as varchar), cast(column_value as varchar) from (" +
- "select " + StatsConstants.STATISTIC_HISTOGRAM_VERSION + " as version, " +
- "$dbId as db_id, " +
- "$tableId as table_id, " +
- "$columnName as column_key, " +
- "count($columnName) as column_value " +
- "from `$dbName`.`$tableName` where $columnName is not null " +
- "group by $columnName " +
- "order by count($columnName) desc limit $topN ) t";
+ "SELECT " +
+ StatsConstants.STATISTIC_HISTOGRAM_VERSION + " as version, " +
+ " $dbId as db_id, " +
+ " $tableId as table_id, " +
+ " $columnName as column_key, " +
+ " count($columnName) as column_value " +
+ "FROM `$dbName`.`$tableName` $sampleClause " +
+ "WHERE $columnName is not null " +
+ "GROUP BY $columnName " +
+ "ORDER BY count($columnName) desc limit $topN ) t";

public HistogramStatisticsCollectJob(Database db, Table table, List<String> columnNames, List<Type> columnTypes,
StatsConstants.ScheduleType scheduleType, Map<String, String> properties) {
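To make the two sampling paths of the rewritten templates concrete, here is a rough rendering of the new inner subquery, assuming a hypothetical table `db`.`t`, column `c`, a sample ratio of 0.1, the default histogram_max_sample_row_count, and ignoring the $MCVExclude predicate (a sketch, not literal output of the code):

-- Table-sample path (histogram_enable_table_sample = true):
SELECT `c` as column_key
FROM `db`.`t` SAMPLE('percent'='10')
WHERE TRUE and `c` is not null
ORDER BY `c` LIMIT 10000000

-- Row-level Bernoulli path (histogram_enable_table_sample = false):
SELECT `c` as column_key
FROM `db`.`t`
WHERE rand() < 0.100000 and `c` is not null
ORDER BY `c` LIMIT 10000000

The point of the switch is that the SAMPLE clause can skip data at scan time, while the rand() filter still reads every row and discards most of it afterwards.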
@@ -75,13 +79,19 @@ public void collect(ConnectContext context, AnalyzeStatus analyzeStatus) throws
for (int i = 0; i < columnNames.size(); i++) {
String columnName = columnNames.get(i);
Type columnType = columnTypes.get(i);
- String sql = buildCollectMCV(db, table, mcvSize, columnName);
+ String sql = buildCollectMCV(db, table, mcvSize, columnName, sampleRatio);
StatisticExecutor statisticExecutor = new StatisticExecutor();
List<TStatisticData> mcv = statisticExecutor.queryMCV(context, sql);

Map<String, String> mostCommonValues = new HashMap<>();
for (TStatisticData tStatisticData : mcv) {
- mostCommonValues.put(tStatisticData.columnName, tStatisticData.histogram);
+ if (sampleRatio > 0.0 && sampleRatio < 1.0) {
+ long count = Long.parseLong(tStatisticData.histogram);
+ count = (long) (1.0 * count / sampleRatio);
+ mostCommonValues.put(tStatisticData.columnName, String.valueOf(count));
+ } else {
+ mostCommonValues.put(tStatisticData.columnName, tStatisticData.histogram);
+ }
}

sql = buildCollectHistogram(db, table, sampleRatio, bucketNum, mostCommonValues, columnName, columnType);
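Because the MCV counts above now come from a sample, each observed count is extrapolated back to a full-table estimate by dividing by the sample ratio: with histogram_mcv_sample_ratio = 0.1, a value counted 1,234 times in the sample is stored as 12,340. The same arithmetic as a throwaway query (illustrative only):

-- 1,234 occurrences in a 10% sample extrapolate to roughly 12,340 rows.
SELECT CAST(1234 / 0.1 AS BIGINT) AS estimated_count;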
@@ -93,7 +103,7 @@ public void collect(ConnectContext context, AnalyzeStatus analyzeStatus) throws
}
}

- private String buildCollectMCV(Database database, Table table, Long topN, String columnName) {
+ private String buildCollectMCV(Database database, Table table, Long topN, String columnName, double sampleRatio) {
VelocityContext context = new VelocityContext();
context.put("tableId", table.getId());
context.put("columnName", StatisticUtils.quoting(table, columnName));
@@ -103,6 +113,13 @@ private String buildCollectMCV(Database database, Table table, Long topN, String
context.put("tableName", table.getName());
context.put("topN", topN);

+ if (sampleRatio > 0.0 && sampleRatio < 1.0) {
+ String sample = String.format("SAMPLE('percent'='%d')", (int) (sampleRatio * 100));
+ context.put("sampleClause", sample);
+ } else {
+ context.put("sampleClause", "");
+ }

return build(context, COLLECT_MCV_STATISTIC_TEMPLATE);
}

@@ -124,6 +141,17 @@ private String buildCollectHistogram(Database database, Table table, double samp
context.put("sampleRatio", sampleRatio);
context.put("totalRows", Config.histogram_max_sample_row_count);

+ // TODO: use it by default and remove this switch
+ if (Config.histogram_enable_table_sample) {
+ String sampleClause = String.format("SAMPLE('percent'='%d')", (int) (sampleRatio * 100));
+ context.put("sampleClause", sampleClause);
+ context.put("randFilter", "TRUE");
+ } else {
+ String randFilter = String.format(" rand() < %f", sampleRatio);
+ context.put("randFilter", randFilter);
+ context.put("sampleClause", "");
+ }

List<String> mcvList = new ArrayList<>();
for (Map.Entry<String, String> entry : mostCommonValues.entrySet()) {
mcvList.add("[\"" + entry.getKey() + "\",\"" + entry.getValue() + "\"]");
