From f02279df8c48a68acf56e138f400ad978ce1b047 Mon Sep 17 00:00:00 2001 From: Martin Date: Sat, 3 Feb 2024 04:04:30 +0800 Subject: [PATCH] martin2038: first submission (#665) * first double as int * - hashcode * JAVA_OPTS empty * native * native * CalculateAverage_melgenek https://questdb.io/blog/building-faster-hash-table-high-performance-sql-joins/#fastmap-internals * mvn formatting * jvm model * 10k name * 10k name * round mean * limit ChunkSize smaller than Integer.MAX_VALUE --------- Co-authored-by: martin.cong --- calculate_average_martin2038.sh | 30 ++ prepare_martin2038.sh | 26 ++ .../onebrc/CalculateAverage_martin2038.java | 337 ++++++++++++++++++ 3 files changed, 393 insertions(+) create mode 100755 calculate_average_martin2038.sh create mode 100755 prepare_martin2038.sh create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_martin2038.java diff --git a/calculate_average_martin2038.sh b/calculate_average_martin2038.sh new file mode 100755 index 000000000..c141e2bdb --- /dev/null +++ b/calculate_average_martin2038.sh @@ -0,0 +1,30 @@ +#!/bin/sh +# +# Copyright 2023 The original authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +if [ -f target/CalculateAverage_martin2038_image ]; then + echo "Picking up existing native image 'target/CalculateAverage_martin2038_image', delete the file to select JVM mode." 1>&2 + target/CalculateAverage_martin2038_image +else + + #JAVA_OPTS="--enable-preview" + echo "Chosing to run the app in JVM mode as no native image was found, use prepare_martin2038.sh to generate." 1>&2 + # JAVA_OPTS="-XX:-EnableJVMCI -Xms16g -Xmx16g -XX:+AlwaysPreTouch -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC" + JAVA_OPTS="" + java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_martin2038 + +fi diff --git a/prepare_martin2038.sh b/prepare_martin2038.sh new file mode 100755 index 000000000..cf8e83f77 --- /dev/null +++ b/prepare_martin2038.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# +# Copyright 2023 The original authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Uncomment below to use sdk +source "$HOME/.sdkman/bin/sdkman-init.sh" +sdk use java 21.0.2-graal 1>&2 +## +#if [ ! -f target/CalculateAverage_martin2038 ]; then +# MAIN=dev.morling.onebrc.CalculateAverage_martin2038 +# NATIVE_IMAGE_OPTS="-H:+UnlockExperimentalVMOptions --initialize-at-build-time=$MAIN --gc=epsilon -O3 -march=native -R:MaxHeapSize=515m -H:-GenLoopSafepoints -H:InlineAllBonus=10 -H:-ParseRuntimeOptions" +# native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_martin2038_image $MAIN +#fi \ No newline at end of file diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_martin2038.java b/src/main/java/dev/morling/onebrc/CalculateAverage_martin2038.java new file mode 100644 index 000000000..073f157c3 --- /dev/null +++ b/src/main/java/dev/morling/onebrc/CalculateAverage_martin2038.java @@ -0,0 +1,337 @@ +/* + * Copyright 2023 The original authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package dev.morling.onebrc; + +import java.io.IOException; +import java.io.RandomAccessFile; +import java.lang.invoke.MethodHandles; +import java.lang.invoke.VarHandle; +import java.nio.ByteOrder; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel.MapMode; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class CalculateAverage_martin2038 { + + // private static final String FILE = "/Users/martin/Garden/blog/1BRC/1brc/./measurements.txt"; + + private static final String FILE = "./measurements.txt"; + + private static class MeasurementAggregator { + private int min = Integer.MAX_VALUE; + private int max = Integer.MIN_VALUE; + private long sum; + private int count; + + void update(int temp) { + update(1, temp, temp, temp); + } + + void update(int cnt, long sm, int min, int max) { + sum += sm; + count += cnt; + if (this.min > min) { + this.min = min; + } + if (this.max < max) { + this.max = max; + } + } + + void merge(MeasurementAggregator it) { + update(it.count, it.sum, it.min, it.max); + } + + public String toString() { + var mean = this.sum / 10.0 / this.count; + return (min / 10f) + "/" + Math.round(mean * 10) / 10f + "/" + (max / 10f); + } + } + + public static void main(String[] args) throws IOException { + + var file = new RandomAccessFile(FILE, "r"); + final int maxNameLength = 110; + var fc = file.getChannel(); + split(file).stream().parallel().map(ck -> { + // StrFastHashKey 比string快500ms + var map = new HashMap(200); + // var pb = System.currentTimeMillis(); + try { + var mb = fc.map(MapMode.READ_ONLY, ck.start, ck.length); + var buff = new byte[maxNameLength]; + while (mb.hasRemaining()) { + var name = readNextHashKey(buff, mb); + // var name = readNextString(buff, mb);// .intern(); + var temp = readNextInt10Times(buff, mb); + add2map(map, name, temp); + } + // long end = ck.start + ck.length; + // do { + // var name = readNext(file, ';', 30).intern(); + // var temp = Double.parseDouble(readNext(file, '\n', 6)); + // var agg = map.computeIfAbsent(name,it->new MeasurementAggregator()); + // agg.update(temp); + // }while (file.getFilePointer() { + + var sb = new StringBuilder(map.size() * 100); + sb.append('{'); + map.entrySet().stream().sorted(Map.Entry.comparingByKey()) + .forEachOrdered(kv -> sb.append(kv.getKey()).append('=').append(kv.getValue()).append(", ")); + sb.deleteCharAt(sb.length() - 1); + sb.setCharAt(sb.length() - 1, '}'); + var resultStr = sb.toString(); + System.out.println(resultStr); + // System.out.println(resultStr.hashCode()); + }); + + } + + static HashMap reduceMap(HashMap aMap, HashMap bMap) { + aMap.forEach((k, v) -> { + var b = bMap.get(k); + if (null == b) { + bMap.put(k, v); + } + else { + b.merge(v); + } + }); + return bMap; + } + + static void add2map(Map map, Key name, int temp) { + // 比computeIfAbsent 节约1秒 + var agg = map.get(name); + if (null == agg) { + agg = new MeasurementAggregator(); + map.put(name, agg); + } + // var agg = map.computeIfAbsent(name,it->new MeasurementAggregator()); + agg.update(temp); + } + + record FileChunk(long start, long length) { + } + + static List split(RandomAccessFile file) throws IOException { + long total = file.length(); + var threadNum = Math.max((int) (total / Integer.MAX_VALUE + 1), Runtime.getRuntime().availableProcessors()); + long avgChunkSize = total / threadNum; + // System.out.println(avgChunkSize +" \t avgChunkSize : INT/MAX \t"+Integer.MAX_VALUE); + // Exception in thread "main" java.lang.IllegalArgumentException: Size exceeds Integer.MAX_VALUE + // at java.base/sun.nio.ch.FileChannelImpl.map(FileChannelImpl.java:1183) + long lastStart = 0; + var list = new ArrayList(threadNum); + for (var i = 0; i < threadNum - 1; i++) { + var length = avgChunkSize; + file.seek(lastStart + length); + while (file.readByte() != '\n') { + // file.seek(lastStart+ ++length); + ++length; + } + // include the '\n' + length++; + list.add(new FileChunk(lastStart, length)); + lastStart += length; + if (lastStart >= total) { + return list; + } + } + list.add(new FileChunk(lastStart, total - lastStart)); + return list; + } + + static StrFastHashKey readNextHashKey(byte[] buf, MappedByteBuffer mb) { + int i = 1; + mb.get(buf, 0, i); + byte b; + while ((b = mb.get()) != ';') { + buf[i++] = b; + } + return new StrFastHashKey(buf, i); + } + + static String readNextString(byte[] buf, MappedByteBuffer mb) { + int i = 1; + mb.get(buf, 0, i); + byte b; + while ((b = mb.get()) != ';') { + buf[i++] = b; + } + return new String(buf, 0, i); + } + + // copy from CalculateAverage_3j5a + // 替换 Double.parse + // 时间 38秒 -> 5418 ms + static int readNextInt10Times(byte[] buf, MappedByteBuffer mb) { + final int min_number_len = 3; + int i = min_number_len; + mb.get(buf, 0, i); + byte b; + while ((b = mb.get()) != '\n') { + buf[i++] = b; + } + // -3.2 + var zeroAscii = '0'; + int temperature = buf[--i] - zeroAscii; + i--; // skipping dot + var base = 10; + while (i > 0) { + b = buf[--i]; + if (b == '-') { + temperature = -temperature; + } + else { + temperature = base * (b - zeroAscii) + temperature; + base *= base; + } + } + return temperature; + } + + // static String readNext(RandomAccessFile file, char endFlag,int initLength) throws IOException { + // StringBuilder input = new StringBuilder(initLength); + // int c = -1; + // //boolean eol = false; + // + // while (true) { + // c = file.read(); + // if( c == endFlag || c == -1) { + // break; + // } + // input.append((char)c); + // } + // + // //if ((c == -1) && (input.length() == 0)) { + // // return null; + // //} + // return input.toString(); + // } + + static class StrFastHashKey implements Comparable { + final byte[] name; + final int hash; + + String nameStr; + + StrFastHashKey(byte[] buf, int size) { + name = new byte[size]; + System.arraycopy(buf, 0, name, 0, size); + // hash = calculateHash(name, 0, size - 1); + // FNV1a save 100+ms than calculateHash + hash = hashFNV1a(name, size); + } + + @Override + public boolean equals(Object o) { + // if (this == o) {return true;} + // if (o == null || getClass() != o.getClass()) {return false;} + StrFastHashKey that = (StrFastHashKey) o; + return hash == that.hash && Arrays.equals(name, that.name); + } + + @Override + public int hashCode() { + return hash; + } + + @Override + public String toString() { + if (null == nameStr) { + nameStr = new String(name); + } + return nameStr; + } + + @Override + public int compareTo(StrFastHashKey o) { + return toString().compareTo(o.toString()); + } + } + + private static final VarHandle LONG_VIEW = MethodHandles.byteArrayViewVarHandle(long[].class, ByteOrder.nativeOrder()) + .withInvokeExactBehavior(); + private static final VarHandle INT_VIEW = MethodHandles.byteArrayViewVarHandle(int[].class, ByteOrder.nativeOrder()) + .withInvokeExactBehavior(); + + /** + * This is a prime number that gives pretty + * good hash distributions + * on the data in this challenge. + */ + private static final long RANDOM_PRIME = 0x7A646E4D; + + /** + * The hash calculation is inspired by + * QuestDB FastMap + */ + private static int calculateHash(byte[] buffer, int startPosition, int endPosition) { + long hash = 0; + + int position = startPosition; + for (; position + Long.BYTES <= endPosition; position += Long.BYTES) { + long value = (long) LONG_VIEW.get(buffer, position); + hash = hash * RANDOM_PRIME + value; + } + + if (position + Integer.BYTES <= endPosition) { + int value = (int) INT_VIEW.get(buffer, position); + hash = hash * RANDOM_PRIME + value; + position += Integer.BYTES; + } + + for (; position <= endPosition; position++) { + hash = hash * RANDOM_PRIME + buffer[position]; + } + hash = hash * RANDOM_PRIME; + return (int) hash ^ (int) (hash >>> 32); + } + + private static final int FNV1_32_INIT = 0x811c9dc5; + private static final int FNV1_PRIME_32 = 16777619; + + /** + * https://github.com/prasanthj/hasher/blob/master/src/main/java/hasher/FNV1a.java + * + * FNV1a 32 bit variant. + * + * @param data - input byte array + * @param length - length of array + * @return - hashcode + */ + public static int hashFNV1a(byte[] data, int length) { + int hash = FNV1_32_INIT; + for (int i = 0; i < length; i++) { + hash ^= (data[i] & 0xff); + hash *= FNV1_PRIME_32; + } + + return hash; + } +}