Merge branch 'main' into main
mtweten authored Dec 3, 2024
2 parents 5162d3f + 658fb24 commit 96c937c
Showing 31 changed files with 3,489 additions and 119 deletions.
2 changes: 1 addition & 1 deletion clients/cmd/fluentd/Dockerfile
@@ -9,7 +9,7 @@ COPY . /src/loki
WORKDIR /src/loki
RUN make BUILD_IN_CONTAINER=false fluentd-plugin

-FROM fluent/fluentd:v1.17-debian-1
+FROM fluent/fluentd:v1.18-debian-1
ENV LOKI_URL="https://logs-prod-us-central1.grafana.net"

COPY --from=build /src/loki/clients/cmd/fluentd/lib/fluent/plugin/out_loki.rb /fluentd/plugins/out_loki.rb
2 changes: 1 addition & 1 deletion clients/cmd/fluentd/docker/Gemfile
@@ -2,5 +2,5 @@

source 'https://rubygems.org'

-gem 'fluentd', '1.17.1'
+gem 'fluentd', '1.18.0'
gem 'fluent-plugin-multi-format-parser', '~>1.1.0'
1 change: 1 addition & 0 deletions docs/sources/send-data/_index.md
@@ -56,6 +56,7 @@ By adding our output plugin you can quickly try Loki without doing big configura
These third-party clients also enable sending logs to Loki:

- [Cribl Loki Destination](https://docs.cribl.io/stream/destinations-loki)
+- [GrafanaLokiLogger](https://github.com/antoniojmsjr/GrafanaLokiLogger) (Delphi/Lazarus)
- [ilogtail](https://github.com/alibaba/ilogtail) (Go)
- [Log4j2 appender for Loki](https://github.com/tkowalcz/tjahzi) (Java)
- [loki-logback-appender](https://github.com/loki4j/loki-logback-appender) (Java)
2 changes: 1 addition & 1 deletion docs/sources/send-data/otel/_index.md
@@ -18,7 +18,7 @@ For ingesting logs to Loki using the OpenTelemetry Collector, you must use the [

When logs are ingested by Loki using an OpenTelemetry protocol (OTLP) ingestion endpoint, some of the data is stored as [Structured Metadata]({{< relref "../../get-started/labels/structured-metadata" >}}).

-You must set `allow_structured_metadata` to `true` within your Loki config file. Otherwise, Loki will reject the log payload as malformed.
+You must set `allow_structured_metadata` to `true` within your Loki config file. Otherwise, Loki will reject the log payload as malformed. Note that Structured Metadata is enabled by default in Loki 3.0 and later.

```yaml
limits_config:
  allow_structured_metadata: true
```
8 changes: 4 additions & 4 deletions docs/sources/send-data/promtail/stages/metrics.md
@@ -51,8 +51,8 @@ type: Counter
[max_idle_duration: <string>]

config:
-# If present and true all log lines will be counted without
-# attempting to match the source to the extract map.
+# If present and true all log lines will be counted without attempting
+# to match the `value` to the field specified by `source` in the extracted map.
# It is an error to specify `match_all: true` and also specify a `value`
[match_all: <bool>]

@@ -231,7 +231,7 @@ This pipeline first tries to find text in the format `order_status=<value>` in
the log line, pulling out the `<value>` into the extracted map with the key
`order_status`.

-The metric stages creates `successful_orders_total` and `failed_orders_total`
+The metrics stage creates `successful_orders_total` and `failed_orders_total`
metrics that only increment when the value of `order_status` in the extracted
map is `success` or `fail` respectively.

@@ -265,7 +265,7 @@ number in the `retries` field from the extracted map.
- metrics:
http_response_time_seconds:
type: Histogram
description: "length of each log line"
description: "distribution of log response time"
source: response_time
config:
buckets: [0.001,0.0025,0.005,0.010,0.025,0.050]
20 changes: 20 additions & 0 deletions docs/sources/shared/configuration.md
@@ -188,6 +188,26 @@ block_builder:
# CLI flag: -blockbuilder.backoff..backoff-retries
[max_retries: <int> | default = 10]

+block_scheduler:
+# Consumer group used by block scheduler to track the last consumed offset.
+# CLI flag: -block-scheduler.consumer-group
+[consumer_group: <string> | default = "block-scheduler"]
+
+# How often the scheduler should plan jobs.
+# CLI flag: -block-scheduler.interval
+[interval: <duration> | default = 5m]
+
+# Period used by the planner to calculate the start and end offset such that
+# each job consumes records spanning the target period.
+# CLI flag: -block-scheduler.target-records-spanning-period
+[target_records_spanning_period: <duration> | default = 1h]
+
+# Lookback period in milliseconds used by the scheduler to plan jobs when the
+# consumer group has no commits. -1 consumes from the latest offset. -2
+# consumes from the start of the partition.
+# CLI flag: -block-scheduler.lookback-period
+[lookback_period: <int> | default = -2]
+
pattern_ingester:
# Whether the pattern ingester is enabled.
# CLI flag: -pattern-ingester.enabled
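For illustration, a minimal Loki configuration exercising the new `block_scheduler` block might look like the sketch below. The keys mirror the reference entries added above; the values themselves are hypothetical.

```yaml
block_scheduler:
  # Offsets are tracked under a dedicated consumer group.
  consumer_group: "block-scheduler"
  # Plan jobs more frequently than the 5m default.
  interval: 2m
  # Each planned job should cover roughly 30 minutes of records.
  target_records_spanning_period: 30m
  # With no prior commits, start from the beginning of each partition (-2).
  lookback_period: -2
```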
4 changes: 2 additions & 2 deletions go.mod
@@ -21,7 +21,7 @@ require (
github.com/alicebob/miniredis/v2 v2.33.0
github.com/aliyun/aliyun-oss-go-sdk v3.0.2+incompatible
github.com/aws/aws-sdk-go v1.55.5
-github.com/baidubce/bce-sdk-go v0.9.202
+github.com/baidubce/bce-sdk-go v0.9.203
github.com/bmatcuk/doublestar/v4 v4.7.1
github.com/c2h5oh/datasize v0.0.0-20231215233829-aa82cc1e6500
github.com/cespare/xxhash/v2 v2.3.0
@@ -76,7 +76,7 @@ require (
github.com/oklog/run v1.1.0
github.com/oklog/ulid v1.3.1 // indirect
github.com/opentracing-contrib/go-grpc v0.1.0
-github.com/opentracing-contrib/go-stdlib v1.0.0
+github.com/opentracing-contrib/go-stdlib v1.1.0
github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b
github.com/oschwald/geoip2-golang v1.11.0
// github.com/pierrec/lz4 v2.0.5+incompatible
8 changes: 4 additions & 4 deletions go.sum
@@ -1008,8 +1008,8 @@ github.com/aws/smithy-go v1.11.1 h1:IQ+lPZVkSM3FRtyaDox41R8YS6iwPMYIreejOgPW49g=
github.com/aws/smithy-go v1.11.1/go.mod h1:3xHYmszWVx2c0kIwQeEVf9uSm4fYZt67FBJnwub1bgM=
github.com/axiomhq/hyperloglog v0.2.0 h1:u1XT3yyY1rjzlWuP6NQIrV4bRYHOaqZaovqjcBEvZJo=
github.com/axiomhq/hyperloglog v0.2.0/go.mod h1:GcgMjz9gaDKZ3G0UMS6Fq/VkZ4l7uGgcJyxA7M+omIM=
-github.com/baidubce/bce-sdk-go v0.9.202 h1:TGRdO4g4CtiI2IZ6MxeUmkbKe6l8kq+mYH6SbxczO3g=
-github.com/baidubce/bce-sdk-go v0.9.202/go.mod h1:zbYJMQwE4IZuyrJiFO8tO8NbtYiKTFTbwh4eIsqjVdg=
+github.com/baidubce/bce-sdk-go v0.9.203 h1:D4YBk4prtlIjrnwrh5nvsSSjLjataApDmeL0fxvI/KU=
+github.com/baidubce/bce-sdk-go v0.9.203/go.mod h1:zbYJMQwE4IZuyrJiFO8tO8NbtYiKTFTbwh4eIsqjVdg=
github.com/bboreham/go-loser v0.0.0-20230920113527-fcc2c21820a3 h1:6df1vn4bBlDDo4tARvBm7l6KA9iVMnE3NWizDeWSrps=
github.com/bboreham/go-loser v0.0.0-20230920113527-fcc2c21820a3/go.mod h1:CIWtjkly68+yqLPbvwwR/fjNJA/idrtULjZWh2v1ys0=
github.com/benbjohnson/clock v1.0.3/go.mod h1:bGMdMPoPVvcYyt1gHDf4J2KE153Yf9BuiUKYMaxlTDM=
@@ -2309,8 +2309,8 @@ github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2sz
github.com/opentracing-contrib/go-grpc v0.1.0 h1:9JHDtQXv6UL0tFF8KJB/4ApJgeOcaHp1h07d0PJjESc=
github.com/opentracing-contrib/go-grpc v0.1.0/go.mod h1:i3/jx/TvJZ/HKidtT4XGIi/NosUEpzS9xjVJctbKZzI=
github.com/opentracing-contrib/go-observer v0.0.0-20170622124052-a52f23424492/go.mod h1:Ngi6UdF0k5OKD5t5wlmGhe/EDKPoUM3BXZSSfIuJbis=
-github.com/opentracing-contrib/go-stdlib v1.0.0 h1:TBS7YuVotp8myLon4Pv7BtCBzOTo1DeZCld0Z63mW2w=
-github.com/opentracing-contrib/go-stdlib v1.0.0/go.mod h1:qtI1ogk+2JhVPIXVc6q+NHziSmy2W5GbdQZFUHADCBU=
+github.com/opentracing-contrib/go-stdlib v1.1.0 h1:cZBWc4pA4e65tqTJddbflK435S0tDImj6c9BMvkdUH0=
+github.com/opentracing-contrib/go-stdlib v1.1.0/go.mod h1:S0p+X9p6dcBkoMTL+Qq2VOvxKs9ys5PpYWXWqlCS0bQ=
github.com/opentracing/basictracer-go v1.0.0/go.mod h1:QfBfYuafItcjQuMwinw9GhYKwFXS9KnPs5lxoYwgW74=
github.com/opentracing/opentracing-go v1.0.2/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o=
github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o=
80 changes: 80 additions & 0 deletions pkg/blockbuilder/scheduler/kafkautil.go
@@ -0,0 +1,80 @@
// SPDX-License-Identifier: AGPL-3.0-only

package scheduler

import (
"context"
"errors"
"fmt"
"sync"

"github.com/twmb/franz-go/pkg/kadm"
"github.com/twmb/franz-go/pkg/kerr"
)

// GetGroupLag is similar to `kadm.Client.Lag` but works when the group doesn't have live participants.
// Similar to `kadm.CalculateGroupLagWithStartOffsets`, it takes into account that the group may not have any commits.
//
// The lag is the difference between the last produced offset (high watermark) and an offset in the "past".
// If the block builder committed an offset for a given partition to the consumer group at least once, then
// the lag is the difference between the last produced offset and the offset committed in the consumer group.
// Otherwise, if the block builder didn't commit an offset for a given partition yet (e.g. block builder is
// running for the first time), then the lag is the difference between the last produced offset and fallbackOffsetMillis.
func GetGroupLag(ctx context.Context, admClient *kadm.Client, topic, group string, fallbackOffsetMillis int64) (kadm.GroupLag, error) {
	offsets, err := admClient.FetchOffsets(ctx, group)
	if err != nil {
		if !errors.Is(err, kerr.GroupIDNotFound) {
			return nil, fmt.Errorf("fetch offsets: %w", err)
		}
	}
	if err := offsets.Error(); err != nil {
		return nil, fmt.Errorf("fetch offsets got error in response: %w", err)
	}

	startOffsets, err := admClient.ListStartOffsets(ctx, topic)
	if err != nil {
		return nil, err
	}
	endOffsets, err := admClient.ListEndOffsets(ctx, topic)
	if err != nil {
		return nil, err
	}

	resolveFallbackOffsets := sync.OnceValues(func() (kadm.ListedOffsets, error) {
		return admClient.ListOffsetsAfterMilli(ctx, fallbackOffsetMillis, topic)
	})
	// If the group-partition in offsets doesn't have a commit, fall back depending on where fallbackOffsetMillis points at.
	for topic, pt := range startOffsets.Offsets() {
		for partition, startOffset := range pt {
			if _, ok := offsets.Lookup(topic, partition); ok {
				continue
			}
			fallbackOffsets, err := resolveFallbackOffsets()
			if err != nil {
				return nil, fmt.Errorf("resolve fallback offsets: %w", err)
			}
			o, ok := fallbackOffsets.Lookup(topic, partition)
			if !ok {
				return nil, fmt.Errorf("partition %d not found in fallback offsets for topic %s", partition, topic)
			}
			if o.Offset < startOffset.At {
				// Skip the resolved fallback offset if it's before the partition's start offset (i.e. before the earliest offset of the partition).
				// This should not happen in Kafka, but can happen in Kafka-compatible systems, e.g. Warpstream.
				continue
			}
			offsets.Add(kadm.OffsetResponse{Offset: kadm.Offset{
				Topic:       o.Topic,
				Partition:   o.Partition,
				At:          o.Offset,
				LeaderEpoch: o.LeaderEpoch,
			}})
		}
	}

	descrGroup := kadm.DescribedGroup{
		// "Empty" is the state that indicates that the group doesn't have active consumer members; this is always the case for block-builder,
		// because we don't use group consumption.
		State: "Empty",
	}
	return kadm.CalculateGroupLagWithStartOffsets(descrGroup, offsets, startOffsets, endOffsets), nil
}
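To make the fallback semantics concrete, a caller could translate the `-block-scheduler.lookback-period` setting documented earlier into the `fallbackOffsetMillis` argument along these lines. This helper is a sketch, not part of this commit; it assumes it lives in the same `scheduler` package (with `"time"` imported) and relies on the convention stated in the config docs that -1 means the latest offset and -2 the start of the partition, matching the sentinel timestamps understood by `ListOffsetsAfterMilli`.

```go
// fallbackOffsetMillis (hypothetical helper) maps a configured lookback
// period to the fallback passed to GetGroupLag. A non-negative lookback is
// converted to a wall-clock cutoff in milliseconds; the -1/-2 sentinels are
// passed through unchanged.
func fallbackOffsetMillis(lookbackPeriodMillis int64, now time.Time) int64 {
	if lookbackPeriodMillis >= 0 {
		return now.Add(-time.Duration(lookbackPeriodMillis) * time.Millisecond).UnixMilli()
	}
	return lookbackPeriodMillis // -1: latest offset, -2: partition start
}
```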
164 changes: 164 additions & 0 deletions pkg/blockbuilder/scheduler/kafkautil_test.go
@@ -0,0 +1,164 @@
// SPDX-License-Identifier: AGPL-3.0-only

package scheduler

import (
"context"
"errors"
"testing"
"time"

"github.com/stretchr/testify/require"
"github.com/twmb/franz-go/pkg/kadm"
"github.com/twmb/franz-go/pkg/kgo"

"github.com/grafana/loki/v3/pkg/kafka/testkafka"
)

const (
testTopic = "test"
testGroup = "testgroup"
)

func TestKafkaGetGroupLag(t *testing.T) {
	ctx, cancel := context.WithCancelCause(context.Background())
	t.Cleanup(func() { cancel(errors.New("test done")) })

	_, addr := testkafka.CreateClusterWithoutCustomConsumerGroupsSupport(t, 3, testTopic)
	kafkaClient := mustKafkaClient(t, addr)
	admClient := kadm.NewClient(kafkaClient)

	const numRecords = 5

	var producedRecords []kgo.Record
	kafkaTime := time.Now().Add(-12 * time.Hour)
	for i := int64(0); i < numRecords; i++ {
		kafkaTime = kafkaTime.Add(time.Minute)

		// Produce and keep records to partition 0.
		res := produceRecords(ctx, t, kafkaClient, kafkaTime, "1", testTopic, 0, []byte(`test value`))
		rec, err := res.First()
		require.NoError(t, err)
		require.NotNil(t, rec)

		producedRecords = append(producedRecords, *rec)

		// Produce same records to partition 1 (this partition won't have any commits).
		produceRecords(ctx, t, kafkaClient, kafkaTime, "1", testTopic, 1, []byte(`test value`))
	}
	require.Len(t, producedRecords, numRecords)

	// Commit last produced record from partition 0.
	rec := producedRecords[len(producedRecords)-1]
	offsets := make(kadm.Offsets)
	offsets.Add(kadm.Offset{
		Topic:       rec.Topic,
		Partition:   rec.Partition,
		At:          rec.Offset + 1,
		LeaderEpoch: rec.LeaderEpoch,
	})
	err := admClient.CommitAllOffsets(ctx, testGroup, offsets)
	require.NoError(t, err)

	// Truncate partition 1 after second to last record to emulate the retention
	// Note Kafka sets partition's start offset to the requested offset. Any records within the segment before the requested offset can no longer be read.
	// Note the difference between DeleteRecords and DeleteOffsets in kadm docs.
	deleteRecOffsets := make(kadm.Offsets)
	deleteRecOffsets.Add(kadm.Offset{
		Topic:     testTopic,
		Partition: 1,
		At:        numRecords - 2,
	})
	_, err = admClient.DeleteRecords(ctx, deleteRecOffsets)
	require.NoError(t, err)

	getTopicPartitionLag := func(t *testing.T, lag kadm.GroupLag, topic string, part int32) int64 {
		l, ok := lag.Lookup(topic, part)
		require.True(t, ok)
		return l.Lag
	}

	t.Run("fallbackOffset=milliseconds", func(t *testing.T) {
		// get the timestamp of the last produced record
		rec := producedRecords[len(producedRecords)-1]
		fallbackOffset := rec.Timestamp.Add(-time.Millisecond).UnixMilli()
		groupLag, err := GetGroupLag(ctx, admClient, testTopic, testGroup, fallbackOffset)
		require.NoError(t, err)

		require.EqualValues(t, 0, getTopicPartitionLag(t, groupLag, testTopic, 0), "partition 0 must have no lag")
		require.EqualValues(t, 1, getTopicPartitionLag(t, groupLag, testTopic, 1), "partition 1 must fall back to known record and get its lag from there")
		require.EqualValues(t, 0, getTopicPartitionLag(t, groupLag, testTopic, 2), "partition 2 has no data and must have no lag")
	})

	t.Run("fallbackOffset=before-earliest", func(t *testing.T) {
		// get the timestamp of third to last produced record (record before earliest in partition 1)
		rec := producedRecords[len(producedRecords)-3]
		fallbackOffset := rec.Timestamp.Add(-time.Millisecond).UnixMilli()
		groupLag, err := GetGroupLag(ctx, admClient, testTopic, testGroup, fallbackOffset)
		require.NoError(t, err)

		require.EqualValues(t, 0, getTopicPartitionLag(t, groupLag, testTopic, 0), "partition 0 must have no lag")
		require.EqualValues(t, 2, getTopicPartitionLag(t, groupLag, testTopic, 1), "partition 1 must fall back to earliest and get its lag from there")
		require.EqualValues(t, 0, getTopicPartitionLag(t, groupLag, testTopic, 2), "partition 2 has no data and must have no lag")
	})

	t.Run("fallbackOffset=0", func(t *testing.T) {
		groupLag, err := GetGroupLag(ctx, admClient, testTopic, testGroup, 0)
		require.NoError(t, err)

		require.EqualValues(t, 0, getTopicPartitionLag(t, groupLag, testTopic, 0), "partition 0 must have no lag")
		require.EqualValues(t, 2, getTopicPartitionLag(t, groupLag, testTopic, 1), "partition 1 must fall back to the earliest and get its lag from there")
		require.EqualValues(t, 0, getTopicPartitionLag(t, groupLag, testTopic, 2), "partition 2 has no data and must have no lag")
	})

	t.Run("group=unknown", func(t *testing.T) {
		groupLag, err := GetGroupLag(ctx, admClient, testTopic, "unknown", 0)
		require.NoError(t, err)

		// This group doesn't have any commits, so it must calc its lag from the fallback.
		require.EqualValues(t, numRecords, getTopicPartitionLag(t, groupLag, testTopic, 0))
		require.EqualValues(t, 2, getTopicPartitionLag(t, groupLag, testTopic, 1), "partition 1 must fall back to the earliest and get its lag from there")
		require.EqualValues(t, 0, getTopicPartitionLag(t, groupLag, testTopic, 2), "partition 2 has no data and must have no lag")
	})
}

func mustKafkaClient(t *testing.T, addrs ...string) *kgo.Client {
	writeClient, err := kgo.NewClient(
		kgo.SeedBrokers(addrs...),
		kgo.AllowAutoTopicCreation(),
		// We will choose the partition of each record.
		kgo.RecordPartitioner(kgo.ManualPartitioner()),
	)
	require.NoError(t, err)
	t.Cleanup(writeClient.Close)
	return writeClient
}

func produceRecords(
	ctx context.Context,
	t *testing.T,
	kafkaClient *kgo.Client,
	ts time.Time,
	userID string,
	topic string,
	part int32,
	val []byte,
) kgo.ProduceResults {
	rec := &kgo.Record{
		Timestamp: ts,
		Key:       []byte(userID),
		Value:     val,
		Topic:     topic,
		Partition: part, // samples in this batch are split between N partitions
	}
	produceResult := kafkaClient.ProduceSync(ctx, rec)
	require.NoError(t, produceResult.FirstErr())
	return produceResult
}

func commitOffset(ctx context.Context, t *testing.T, kafkaClient *kgo.Client, group string, offset kadm.Offset) {
	offsets := make(kadm.Offsets)
	offsets.Add(offset)
	err := kadm.NewClient(kafkaClient).CommitAllOffsets(ctx, group, offsets)
	require.NoError(t, err)
}
24 changes: 24 additions & 0 deletions pkg/blockbuilder/scheduler/metrics.go
@@ -0,0 +1,24 @@
package scheduler

import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)

type Metrics struct {
	lag             *prometheus.GaugeVec
	committedOffset *prometheus.GaugeVec
}

func NewMetrics(reg prometheus.Registerer) *Metrics {
	return &Metrics{
		lag: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
			Name: "loki_block_scheduler_group_lag",
			Help: "How far behind the block scheduler consumer group is from the latest offset.",
		}, []string{"partition"}),
		committedOffset: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
			Name: "loki_block_scheduler_group_committed_offset",
			Help: "The current offset the block scheduler consumer group is at.",
		}, []string{"partition"}),
	}
}
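As a sketch of how these pieces could fit together (hypothetical glue code, not part of this commit): a scheduler loop might periodically call `GetGroupLag` and mirror the result into the gauges above. The sketch assumes it lives in the `scheduler` package, and that `kadm.GroupLag` can be ranged over by topic and partition, with each entry exposing `Lag` and the committed offset `Commit.At`, as the test helper's `Lookup` usage suggests.

```go
// updateMetrics (hypothetical) computes the group lag once and exports it,
// labeled by partition, through the Metrics gauges defined above.
func updateMetrics(ctx context.Context, adm *kadm.Client, m *Metrics, topic, group string, fallbackMillis int64) error {
	lag, err := GetGroupLag(ctx, adm, topic, group, fallbackMillis)
	if err != nil {
		return fmt.Errorf("get group lag: %w", err)
	}
	for _, partitions := range lag {
		for partition, l := range partitions {
			p := fmt.Sprint(partition)
			m.lag.WithLabelValues(p).Set(float64(l.Lag))
			m.committedOffset.WithLabelValues(p).Set(float64(l.Commit.At))
		}
	}
	return nil
}
```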