diff --git a/core/src/main/java/org/apache/accumulo/core/conf/Property.java b/core/src/main/java/org/apache/accumulo/core/conf/Property.java index 80a9b21b33f..fc5a52f2393 100644 --- a/core/src/main/java/org/apache/accumulo/core/conf/Property.java +++ b/core/src/main/java/org/apache/accumulo/core/conf/Property.java @@ -382,9 +382,6 @@ public enum Property { MANAGER_WALOG_CLOSER_IMPLEMETATION("manager.walog.closer.implementation", "org.apache.accumulo.server.manager.recovery.HadoopLogCloser", PropertyType.CLASSNAME, "A class that implements a mechanism to steal write access to a write-ahead log.", "1.5.0"), - @Deprecated - MANAGER_FATE_METRICS_ENABLED("manager.fate.metrics.enabled", "true", PropertyType.BOOLEAN, - "Enable reporting of FATE metrics in JMX (and logging with Hadoop Metrics2).", "1.9.3"), MANAGER_FATE_METRICS_MIN_UPDATE_INTERVAL("manager.fate.metrics.min.update.interval", "60s", PropertyType.TIMEDURATION, "Limit calls from metric sinks to zookeeper to update interval.", "1.9.3"), @@ -924,9 +921,6 @@ public enum Property { + " and possibly compacted. Legal values are: compact - which both flushes and compacts the" + " metadata; flush - which flushes only (compactions may be triggered if required); or none.", "1.10.0"), - @Deprecated - GC_METRICS_ENABLED("gc.metrics.enabled", "true", PropertyType.BOOLEAN, - "Enable detailed gc metrics reporting with hadoop metrics.", "1.10.0"), // properties that are specific to the monitor server behavior MONITOR_PREFIX("monitor.", null, PropertyType.PREFIX, diff --git a/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java b/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java index 954e6d98c35..3a368ef1c7f 100644 --- a/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java +++ b/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java @@ -314,37 +314,6 @@ * * * - * queries - * Gauge - * {@value #METRICS_TSERVER_QUERIES} - * Gauge - * - * - * - * scannedRate - * Gauge - * {@value #METRICS_TSERVER_SCANNED_ENTRIES} - * Gauge - * Prior to 2.1.0 this metric was reported as a rate, it is now the count and the rate can be - * derived - * - * - * queryRate - * Gauge - * {@value #METRICS_TSERVER_SCAN_RESULTS} - * Gauge - * Prior to 2.1.0 this metric was reported as a rate, it is now the count and the rate can be - * derived - * - * - * queryByteRate - * Gauge - * {@value #METRICS_TSERVER_SCAN_RESULTS_BYTES} - * Gauge - * Prior to 2.1.0 this metric was reported as a rate, it is now the count and the rate can be - * derived - * - * * ingestRate * Gauge * {@value #METRICS_TSERVER_INGEST_MUTATIONS} @@ -367,6 +336,28 @@ * Gauge * * + * + * + * N/A + * N/A + * {@value #METRICS_SCAN_RESERVATION_TIMER} + * Timer + * Time to reserve a tablets files for scan + * + * + * N/A + * N/A + * {@value #METRICS_SCAN_BUSY_TIMEOUT_COUNTER} + * Counter + * Count of the scans where a busy timeout happened + * + * + * N/A + * N/A + * {@value #METRICS_SCAN_TABLET_METADATA_CACHE} + * Cache + * scan server tablet cache metrics + * * * * scan @@ -418,12 +409,36 @@ * * * - * N/A - * N/A - * {@value #METRICS_SCAN_BUSY_TIMEOUT} - * Counter + * queries + * Gauge + * {@value #METRICS_SCAN_QUERIES} + * Gauge * * + * + * scannedRate + * Gauge + * {@value #METRICS_SCAN_SCANNED_ENTRIES} + * Gauge + * Prior to 2.1.0 this metric was reported as a rate, it is now the count and the rate can be + * derived + * + * + * queryRate + * Gauge + * {@value #METRICS_SCAN_QUERY_SCAN_RESULTS} + * Gauge + * Prior to 2.1.0 this metric was reported as a rate, it is now the count and the rate can be + * derived + * + * + * queryByteRate + * Gauge + * {@value #METRICS_SCAN_QUERY_SCAN_RESULTS_BYTES} + * Gauge + * Prior to 2.1.0 this metric was reported as a rate, it is now the count and the rate can be + * derived + * * * * {i|e}_{compactionServiceName}_{executor_name}_queued @@ -605,7 +620,7 @@ public interface MetricsProducer { String METRICS_REPLICATION_PEERS = METRICS_REPLICATION_PREFIX + "peers"; String METRICS_REPLICATION_THREADS = METRICS_REPLICATION_PREFIX + "threads"; - String METRICS_SCAN_PREFIX = "accumulo.tserver.scans."; + String METRICS_SCAN_PREFIX = "accumulo.scan."; String METRICS_SCAN_TIMES = METRICS_SCAN_PREFIX + "times"; String METRICS_SCAN_OPEN_FILES = METRICS_SCAN_PREFIX + "files.open"; String METRICS_SCAN_RESULTS = METRICS_SCAN_PREFIX + "result"; @@ -613,7 +628,14 @@ public interface MetricsProducer { String METRICS_SCAN_START = METRICS_SCAN_PREFIX + "start"; String METRICS_SCAN_CONTINUE = METRICS_SCAN_PREFIX + "continue"; String METRICS_SCAN_CLOSE = METRICS_SCAN_PREFIX + "close"; - String METRICS_SCAN_BUSY_TIMEOUT = METRICS_SCAN_PREFIX + "busy.timeout"; + String METRICS_SCAN_BUSY_TIMEOUT_COUNTER = METRICS_SCAN_PREFIX + "busy.timeout.count"; + String METRICS_SCAN_RESERVATION_TIMER = METRICS_SCAN_PREFIX + "reservation.timer"; + String METRICS_SCAN_QUERIES = METRICS_SCAN_PREFIX + "queries"; + String METRICS_SCAN_QUERY_SCAN_RESULTS = METRICS_SCAN_PREFIX + "query.results"; + String METRICS_SCAN_QUERY_SCAN_RESULTS_BYTES = METRICS_SCAN_PREFIX + "query.results.bytes"; + String METRICS_SCAN_SCANNED_ENTRIES = METRICS_SCAN_PREFIX + "query.scanned.entries"; + + String METRICS_SCAN_TABLET_METADATA_CACHE = METRICS_SCAN_PREFIX + "tablet.metadata.cache"; String METRICS_TSERVER_PREFIX = "accumulo.tserver."; String METRICS_TSERVER_ENTRIES = METRICS_TSERVER_PREFIX + "entries"; @@ -629,14 +651,10 @@ public interface MetricsProducer { String METRICS_TSERVER_TABLETS_ONLINE = METRICS_TSERVER_PREFIX + "tablets.online"; String METRICS_TSERVER_TABLETS_OPENING = METRICS_TSERVER_PREFIX + "tablets.opening"; String METRICS_TSERVER_TABLETS_UNOPENED = METRICS_TSERVER_PREFIX + "tablets.unopened"; - String METRICS_TSERVER_QUERIES = METRICS_TSERVER_PREFIX + "queries"; String METRICS_TSERVER_TABLETS_FILES = METRICS_TSERVER_PREFIX + "tablets.files"; String METRICS_TSERVER_HOLD = METRICS_TSERVER_PREFIX + "hold"; String METRICS_TSERVER_INGEST_MUTATIONS = METRICS_TSERVER_PREFIX + "ingest.mutations"; String METRICS_TSERVER_INGEST_BYTES = METRICS_TSERVER_PREFIX + "ingest.bytes"; - String METRICS_TSERVER_SCAN_RESULTS = METRICS_TSERVER_PREFIX + "scan.results"; - String METRICS_TSERVER_SCAN_RESULTS_BYTES = METRICS_TSERVER_PREFIX + "scan.results.bytes"; - String METRICS_TSERVER_SCANNED_ENTRIES = METRICS_TSERVER_PREFIX + "scan.scanned.entries"; String METRICS_THRIFT_PREFIX = "accumulo.thrift."; String METRICS_THRIFT_EXECUTE = METRICS_THRIFT_PREFIX + "execute"; @@ -673,7 +691,7 @@ default Map getMetricFields() { fields.put((String) f.get(MetricsProducer.class), f.getName()); } catch (IllegalArgumentException | IllegalAccessException e) { // this shouldn't happen, but let's log it anyway - LOG.error("Error getting metric value for field: " + f.getName()); + LOG.error("Error getting metric value for field: {}", f.getName()); } } } diff --git a/server/compaction-coordinator/src/main/java/org/apache/accumulo/coordinator/CompactionCoordinator.java b/server/compaction-coordinator/src/main/java/org/apache/accumulo/coordinator/CompactionCoordinator.java index 2b0dee53b7e..9ac4ddd27a5 100644 --- a/server/compaction-coordinator/src/main/java/org/apache/accumulo/coordinator/CompactionCoordinator.java +++ b/server/compaction-coordinator/src/main/java/org/apache/accumulo/coordinator/CompactionCoordinator.java @@ -100,6 +100,9 @@ public class CompactionCoordinator extends AbstractServer implements CompactionCoordinatorService.Iface, LiveTServerSet.Listener { private static final Logger LOG = LoggerFactory.getLogger(CompactionCoordinator.class); + + private static final Logger STATUS_LOG = + LoggerFactory.getLogger(CompactionCoordinator.class.getName() + ".compaction.status"); private static final long TIME_BETWEEN_GC_CHECKS = 5000; protected static final QueueSummaries QUEUE_SUMMARIES = new QueueSummaries(); @@ -585,8 +588,8 @@ public void updateCompactionStatus(TInfo tinfo, TCredentials credentials, throw new AccumuloSecurityException(credentials.getPrincipal(), SecurityErrorCode.PERMISSION_DENIED).asThriftException(); } - LOG.debug("Compaction status update, id: {}, timestamp: {}, update: {}", externalCompactionId, - timestamp, update); + STATUS_LOG.debug("Compaction status update, id: {}, timestamp: {}, update: {}", + externalCompactionId, timestamp, update); final RunningCompaction rc = RUNNING_CACHE.get(ExternalCompactionId.of(externalCompactionId)); if (null != rc) { rc.addUpdate(timestamp, update); diff --git a/server/compaction-coordinator/src/main/java/org/apache/accumulo/coordinator/DeadCompactionDetector.java b/server/compaction-coordinator/src/main/java/org/apache/accumulo/coordinator/DeadCompactionDetector.java index ba4a575ddfe..b58f06a31ee 100644 --- a/server/compaction-coordinator/src/main/java/org/apache/accumulo/coordinator/DeadCompactionDetector.java +++ b/server/compaction-coordinator/src/main/java/org/apache/accumulo/coordinator/DeadCompactionDetector.java @@ -117,8 +117,15 @@ private void detectDeadCompactions() { }); tabletCompactions.forEach((ecid, extent) -> { - log.debug("Possible dead compaction detected {} {}", ecid, extent); - this.deadCompactions.merge(ecid, 1L, Long::sum); + var count = this.deadCompactions.merge(ecid, 1L, Long::sum); + if (count == 1) { + // The first time a possible dead compaction is seen, for quick compactions there is a good + // chance that it is already complete instead of dead. In order to avoid spamming the logs + // w/ false positives, log the first seen at trace. + log.trace("Possible dead compaction detected {} {} {}", ecid, extent, count); + } else { + log.debug("Possible dead compaction detected {} {} {}", ecid, extent, count); + } }); // Everything left in tabletCompactions is no longer running anywhere and should be failed. diff --git a/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java b/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java index 81744441aa6..216526d328c 100644 --- a/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java +++ b/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java @@ -344,12 +344,17 @@ public void run() { TServerConnection client = manager.tserverSet.getConnection(location.getServerInstance()); if (client != null) { - Manager.log.trace("[{}] Requesting TabletServer {} unload {} {}", store.name(), - location.getServerInstance(), tls.extent, goal.howUnload()); - client.unloadTablet(manager.managerLock, tls.extent, goal.howUnload(), - manager.getSteadyTime()); - unloaded++; - totalUnloaded++; + try { + Manager.log.trace("[{}] Requesting TabletServer {} unload {} {}", store.name(), + location.getServerInstance(), tls.extent, goal.howUnload()); + client.unloadTablet(manager.managerLock, tls.extent, goal.howUnload(), + manager.getSteadyTime()); + unloaded++; + totalUnloaded++; + } catch (TException tException) { + Manager.log.warn("[{}] Failed to request tablet unload {} {} {}", store.name(), + location.getServerInstance(), tls.extent, goal.howUnload(), tException); + } } else { Manager.log.warn("Could not connect to server {}", location); } @@ -1036,13 +1041,19 @@ private void flushChanges(TabletLists tLists, WalStateManager wals) } tLists.assignments.addAll(tLists.assigned); for (Assignment a : tLists.assignments) { - TServerConnection client = manager.tserverSet.getConnection(a.server); - if (client != null) { - client.assignTablet(manager.managerLock, a.tablet); - } else { - Manager.log.warn("Could not connect to server {}", a.server); + try { + TServerConnection client = manager.tserverSet.getConnection(a.server); + if (client != null) { + client.assignTablet(manager.managerLock, a.tablet); + manager.assignedTablet(a.tablet); + } else { + Manager.log.warn("Could not connect to server {} for assignment of {}", a.server, + a.tablet); + } + } catch (TException tException) { + Manager.log.warn("Could not connect to server {} for assignment of {}", a.server, a.tablet, + tException); } - manager.assignedTablet(a.tablet); } } diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java index bcba26aa135..94186ed38f3 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java @@ -76,6 +76,7 @@ import org.apache.accumulo.core.tabletserver.thrift.ActiveScan; import org.apache.accumulo.core.tabletserver.thrift.NoSuchScanIDException; import org.apache.accumulo.core.tabletserver.thrift.NotServingTabletException; +import org.apache.accumulo.core.tabletserver.thrift.ScanServerBusyException; import org.apache.accumulo.core.tabletserver.thrift.TSampleNotPresentException; import org.apache.accumulo.core.tabletserver.thrift.TSamplerConfiguration; import org.apache.accumulo.core.tabletserver.thrift.TabletScanClientService; @@ -122,6 +123,8 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Sets; +import io.micrometer.core.instrument.Tag; + public class ScanServer extends AbstractServer implements TabletScanClientService.Iface, TabletHostingServer { @@ -199,6 +202,7 @@ private TabletMetadataLoader(Ample ample) { private volatile boolean serverStopRequested = false; private ServiceLock scanServerLock; protected TabletServerScanMetrics scanMetrics; + private ScanServerMetrics scanServerMetrics; private ZooCache managerLockCache; @@ -243,7 +247,7 @@ public ScanServer(ScanServerOpts opts, String[] args) { } tabletMetadataCache = Caffeine.newBuilder().expireAfterWrite(cacheExpiration, TimeUnit.MILLISECONDS) - .scheduler(Scheduler.systemScheduler()).build(tabletMetadataLoader); + .scheduler(Scheduler.systemScheduler()).recordStats().build(tabletMetadataLoader); } delegate = newThriftScanClientHandler(new WriteTracker()); @@ -338,6 +342,7 @@ public void unableToMonitorLockNode(final Exception e) { // Don't use the normal ServerServices lock content, instead put the server UUID here. byte[] lockContent = (serverLockUUID.toString() + "," + groupName).getBytes(UTF_8); + // wait for 120 seconds with 5 second delay for (int i = 0; i < 120 / 5; i++) { zoo.putPersistentData(zLockPath.toString(), new byte[0], NodeExistsPolicy.SKIP); @@ -371,10 +376,12 @@ public void run() { MetricsInfo metricsInfo = getContext().getMetricsInfo(); metricsInfo.addServiceTags(getApplicationName(), clientAddress); + metricsInfo.addCommonTags(List.of(Tag.of("resource.group", groupName))); scanMetrics = new TabletServerScanMetrics(); + scanServerMetrics = new ScanServerMetrics(tabletMetadataCache); - metricsInfo.addMetricsProducers(scanMetrics); + metricsInfo.addMetricsProducers(scanMetrics, scanServerMetrics); metricsInfo.init(); // We need to set the compaction manager so that we don't get an NPE in CompactableImpl.close @@ -657,6 +664,19 @@ private Map reserveFilesInner(Collection ex } } + @VisibleForTesting + ScanReservation reserveFilesInstrumented(Map> extents) + throws AccumuloException { + long start = System.nanoTime(); + try { + return reserveFiles(extents); + } finally { + scanServerMetrics.getReservationTimer().record(System.nanoTime() - start, + TimeUnit.NANOSECONDS); + } + + } + protected ScanReservation reserveFiles(Map> extents) throws AccumuloException { @@ -687,6 +707,16 @@ protected ScanReservation reserveFiles(Map> extents) return new ScanReservation(tabletsMetadata, myReservationId, failures); } + private ScanReservation reserveFilesInstrumented(long scanId) throws NoSuchScanIDException { + long start = System.nanoTime(); + try { + return reserveFiles(scanId); + } finally { + scanServerMetrics.getReservationTimer().record(System.nanoTime() - start, + TimeUnit.NANOSECONDS); + } + } + protected ScanReservation reserveFiles(long scanId) throws NoSuchScanIDException { var session = (ScanSession) sessionManager.getSession(scanId); if (session == null) { @@ -875,7 +905,7 @@ public InitialScan startScan(TInfo tinfo, TCredentials credentials, TKeyExtent t KeyExtent extent = getKeyExtent(textent); try (ScanReservation reservation = - reserveFiles(Map.of(extent, Collections.singletonList(range)))) { + reserveFilesInstrumented(Map.of(extent, Collections.singletonList(range)))) { if (reservation.getFailures().containsKey(textent)) { throw new NotServingTabletException(extent.toThrift()); @@ -889,7 +919,9 @@ batchTimeOut, classLoaderContext, executionHints, getScanTabletResolver(tablet), busyTimeout); return is; - + } catch (ScanServerBusyException be) { + scanServerMetrics.incrementBusy(); + throw be; } catch (AccumuloException | IOException e) { LOG.error("Error starting scan", e); throw new RuntimeException(e); @@ -905,6 +937,9 @@ public ScanResult continueScan(TInfo tinfo, long scanID, long busyTimeout) try (ScanReservation reservation = reserveFiles(scanID)) { Preconditions.checkState(reservation.getFailures().isEmpty()); return delegate.continueScan(tinfo, scanID, busyTimeout); + } catch (ScanServerBusyException be) { + scanServerMetrics.incrementBusy(); + throw be; } } @@ -933,7 +968,7 @@ public InitialMultiScan startMultiScan(TInfo tinfo, TCredentials credentials, batch.put(extent, entry.getValue()); } - try (ScanReservation reservation = reserveFiles(batch)) { + try (ScanReservation reservation = reserveFilesInstrumented(batch)) { HashMap tablets = new HashMap<>(); reservation.getTabletMetadataExtents().forEach(extent -> { @@ -950,6 +985,9 @@ public InitialMultiScan startMultiScan(TInfo tinfo, TCredentials credentials, LOG.trace("started scan: {}", ims.getScanID()); return ims; + } catch (ScanServerBusyException be) { + scanServerMetrics.incrementBusy(); + throw be; } catch (TException e) { LOG.error("Error starting scan", e); throw e; @@ -967,6 +1005,9 @@ public MultiScanResult continueMultiScan(TInfo tinfo, long scanID, long busyTime try (ScanReservation reservation = reserveFiles(scanID)) { Preconditions.checkState(reservation.getFailures().isEmpty()); return delegate.continueMultiScan(tinfo, scanID, busyTimeout); + } catch (ScanServerBusyException be) { + scanServerMetrics.incrementBusy(); + throw be; } } diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java new file mode 100644 index 00000000000..1a516b597bb --- /dev/null +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.accumulo.tserver; + +import org.apache.accumulo.core.dataImpl.KeyExtent; +import org.apache.accumulo.core.metadata.schema.TabletMetadata; +import org.apache.accumulo.core.metrics.MetricsProducer; + +import com.github.benmanes.caffeine.cache.LoadingCache; + +import io.micrometer.core.instrument.Counter; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.Timer; +import io.micrometer.core.instrument.binder.cache.CaffeineCacheMetrics; + +public class ScanServerMetrics implements MetricsProducer { + + private Timer reservationTimer; + private Counter busyTimeoutCount; + + private final LoadingCache tabletMetadataCache; + + public ScanServerMetrics(final LoadingCache tabletMetadataCache) { + this.tabletMetadataCache = tabletMetadataCache; + } + + @Override + public void registerMetrics(MeterRegistry registry) { + reservationTimer = Timer.builder(MetricsProducer.METRICS_SCAN_RESERVATION_TIMER) + .description("Time to reserve a tablets files for scan").register(registry); + busyTimeoutCount = Counter.builder(METRICS_SCAN_BUSY_TIMEOUT_COUNTER) + .description("The number of scans where a busy timeout happened").register(registry); + CaffeineCacheMetrics.monitor(registry, tabletMetadataCache, METRICS_SCAN_TABLET_METADATA_CACHE); + } + + public Timer getReservationTimer() { + return reservationTimer; + } + + public void incrementBusy() { + busyTimeoutCount.increment(); + } +} diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/ThriftScanClientHandler.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/ThriftScanClientHandler.java index d1ff17347ae..8a99b2315e4 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/ThriftScanClientHandler.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/ThriftScanClientHandler.java @@ -284,7 +284,7 @@ protected ScanResult continueScan(TInfo tinfo, long scanID, SingleScanSession sc server.getSessionManager().removeSession(scanID); TabletBase tablet = scanSession.getTabletResolver().getTablet(scanSession.extent); if (busyTimeout > 0) { - server.getScanMetrics().incrementScanBusyTimeout(1.0D); + server.getScanMetrics().incrementBusy(1.0D); throw new ScanServerBusyException(); } else if (tablet == null || tablet.isClosed()) { throw new NotServingTabletException(scanSession.extent.toThrift()); @@ -495,7 +495,7 @@ private MultiScanResult continueMultiScan(long scanID, MultiScanSession session, } catch (CancellationException ce) { server.getSessionManager().removeSession(scanID); if (busyTimeout > 0) { - server.getScanMetrics().incrementScanBusyTimeout(1.0D); + server.getScanMetrics().incrementBusy(1.0D); throw new ScanServerBusyException(); } else { log.warn("Failed to get multiscan result", ce); diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerMetricsUtil.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerMetricsUtil.java index f508a6871ed..599065d7c76 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerMetricsUtil.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerMetricsUtil.java @@ -24,8 +24,6 @@ /** * Wrapper around extracting metrics from a TabletServer instance - * - * Necessary to support both old custom JMX metrics and Hadoop Metrics2 */ public class TabletServerMetricsUtil { diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java index 492a6a8c803..8e066dd7f71 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java @@ -39,7 +39,7 @@ public class TabletServerScanMetrics implements MetricsProducer { private Counter startScanCalls; private Counter continueScanCalls; private Counter closeScanCalls; - private Counter busyTimeoutReturned; + private Counter busyTimeoutCount; private final LongAdder lookupCount = new LongAdder(); private final LongAdder queryResultCount = new LongAdder(); @@ -114,8 +114,8 @@ public void incrementCloseScan(double value) { closeScanCalls.increment(value); } - public void incrementScanBusyTimeout(double value) { - busyTimeoutReturned.increment(value); + public void incrementBusy(double value) { + busyTimeoutCount.increment(value); } @Override @@ -133,17 +133,19 @@ public void registerMetrics(MeterRegistry registry) { .description("calls to continue a scan / multiscan").register(registry); closeScanCalls = Counter.builder(METRICS_SCAN_CLOSE) .description("calls to close a scan / multiscan").register(registry); - busyTimeoutReturned = Counter.builder(METRICS_SCAN_BUSY_TIMEOUT) - .description("times that a scan has timed out in the queue").register(registry); - Gauge.builder(METRICS_TSERVER_QUERIES, this, TabletServerScanMetrics::getLookupCount) + busyTimeoutCount = Counter.builder(METRICS_SCAN_BUSY_TIMEOUT_COUNTER) + .description("The number of scans where a busy timeout happened").register(registry); + Gauge.builder(METRICS_SCAN_QUERIES, this, TabletServerScanMetrics::getLookupCount) .description("Number of queries").register(registry); - Gauge.builder(METRICS_TSERVER_SCAN_RESULTS, this, TabletServerScanMetrics::getQueryResultCount) + Gauge + .builder(METRICS_SCAN_QUERY_SCAN_RESULTS, this, + TabletServerScanMetrics::getQueryResultCount) .description("Query rate (entries/sec)").register(registry); Gauge - .builder(METRICS_TSERVER_SCAN_RESULTS_BYTES, this, + .builder(METRICS_SCAN_QUERY_SCAN_RESULTS_BYTES, this, TabletServerScanMetrics::getQueryByteCount) .description("Query rate (bytes/sec)").register(registry); - Gauge.builder(METRICS_TSERVER_SCANNED_ENTRIES, this, TabletServerScanMetrics::getScannedCount) + Gauge.builder(METRICS_SCAN_SCANNED_ENTRIES, this, TabletServerScanMetrics::getScannedCount) .description("Scanned rate").register(registry); } diff --git a/server/tserver/src/test/java/org/apache/accumulo/tserver/ScanServerTest.java b/server/tserver/src/test/java/org/apache/accumulo/tserver/ScanServerTest.java index fdb79e1b00f..b7f64e05241 100644 --- a/server/tserver/src/test/java/org/apache/accumulo/tserver/ScanServerTest.java +++ b/server/tserver/src/test/java/org/apache/accumulo/tserver/ScanServerTest.java @@ -101,6 +101,10 @@ protected ScanReservation reserveFiles(long scanId) throws NoSuchScanIDException return reservation; } + @Override + ScanReservation reserveFilesInstrumented(Map> extents) { + return reservation; + } } private ThriftScanClientHandler handler; diff --git a/test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java b/test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java index aa7bae035a9..0782adb70df 100644 --- a/test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java +++ b/test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java @@ -100,9 +100,12 @@ public void confirmMetricsPublished() throws Exception { cluster.stop(); Set unexpectedMetrics = Set.of(METRICS_SCAN_YIELDS, METRICS_UPDATE_ERRORS, - METRICS_REPLICATION_QUEUE, METRICS_COMPACTOR_MAJC_STUCK, METRICS_SCAN_BUSY_TIMEOUT); + METRICS_REPLICATION_QUEUE, METRICS_COMPACTOR_MAJC_STUCK, METRICS_SCAN_BUSY_TIMEOUT_COUNTER); + // add sserver as flaky until scan server included in mini tests. Set flakyMetrics = Set.of(METRICS_GC_WAL_ERRORS, METRICS_FATE_TYPE_IN_PROGRESS, - METRICS_MANAGER_BALANCER_NEED_MIGRATION, METRICS_MANAGER_BALANCER_MIGRATING); + METRICS_SCAN_BUSY_TIMEOUT_COUNTER, METRICS_SCAN_RESERVATION_TIMER, + METRICS_SCAN_TABLET_METADATA_CACHE, METRICS_MANAGER_BALANCER_NEED_MIGRATION, + METRICS_MANAGER_BALANCER_MIGRATING); Map expectedMetricNames = this.getMetricFields(); flakyMetrics.forEach(expectedMetricNames::remove); // might not see these