diff --git a/cli_flags.go b/cli_flags.go
index ac1ca806..182cd768 100644
--- a/cli_flags.go
+++ b/cli_flags.go
@@ -12,6 +12,7 @@ import (
 	"github.com/peterbourgon/ff/v3"
 
 	"go.opentelemetry.io/ebpf-profiler/internal/controller"
+	"go.opentelemetry.io/ebpf-profiler/support"
 	"go.opentelemetry.io/ebpf-profiler/tracer"
 )
 
@@ -24,6 +25,7 @@ const (
 	defaultProbabilisticThreshold = tracer.ProbabilisticThresholdMax
 	defaultProbabilisticInterval  = 1 * time.Minute
 	defaultArgSendErrorFrames     = false
+	defaultOffCPUThreshold        = support.OffCPUThresholdMax
 
 	// This is the X in 2^(n + x) where n is the default hardcoded map size value
 	defaultArgMapScaleFactor = 0
@@ -61,6 +63,11 @@ var (
 		"If zero, monotonic-realtime clock sync will be performed once, " +
 		"on agent startup, but not periodically."
 	sendErrorFramesHelp = "Send error frames (devfiler only, breaks Kibana)"
+	offCPUThresholdHelp = fmt.Sprintf("If set to a value between 1 and %d will enable "+
+		"off-cpu profiling: Every time an off-cpu entry point is hit, a random number between "+
+		"0 and %d is chosen. If the given threshold is greater than this random number, the "+
+		"off-cpu trace is collected and reported.",
+		support.OffCPUThresholdMax-1, support.OffCPUThresholdMax-1)
 )
 
 // Package-scope variable, so that conditionally compiled other components can refer
@@ -114,6 +121,9 @@ func parseArgs() (*controller.Config, error) {
 	fs.BoolVar(&args.VerboseMode, "verbose", false, verboseModeHelp)
 	fs.BoolVar(&args.Version, "version", false, versionHelp)
 
+	fs.UintVar(&args.OffCPUThreshold, "off-cpu-threshold",
+		defaultOffCPUThreshold, offCPUThresholdHelp)
+
 	fs.Usage = func() {
 		fs.PrintDefaults()
 	}
diff --git a/host/host.go b/host/host.go
index b6c2a4fc..81afb631 100644
--- a/host/host.go
+++ b/host/host.go
@@ -54,6 +54,8 @@ type Trace struct {
 	KTime            times.KTime
 	PID              libpf.PID
 	TID              libpf.PID
+	Origin           libpf.Origin
+	OffTime          uint64 // Time a task was off-cpu in nanoseconds.
 	APMTraceID       libpf.APMTraceID
 	APMTransactionID libpf.APMTransactionID
 	CPU              int
diff --git a/internal/controller/config.go b/internal/controller/config.go
index dfd96b91..9a6ece49 100644
--- a/internal/controller/config.go
+++ b/internal/controller/config.go
@@ -33,7 +33,8 @@ type Config struct {
 	// HostName is the name of the host.
 	HostName string
 	// IPAddress is the IP address of the host that sends data to CollAgentAddr.
-	IPAddress string
+	IPAddress       string
+	OffCPUThreshold uint
 
 	Reporter reporter.Reporter
 
diff --git a/internal/controller/controller.go b/internal/controller/controller.go
index 3b3f96da..cfaf4993 100644
--- a/internal/controller/controller.go
+++ b/internal/controller/controller.go
@@ -13,6 +13,7 @@ import (
 	"go.opentelemetry.io/ebpf-profiler/internal/helpers"
 	"go.opentelemetry.io/ebpf-profiler/metrics"
 	"go.opentelemetry.io/ebpf-profiler/reporter"
+	"go.opentelemetry.io/ebpf-profiler/support"
 	"go.opentelemetry.io/ebpf-profiler/times"
 	"go.opentelemetry.io/ebpf-profiler/tracehandler"
 	"go.opentelemetry.io/ebpf-profiler/tracer"
@@ -124,6 +125,13 @@ func (c *Controller) Start(ctx context.Context) error {
 	}
 	log.Info("Attached tracer program")
 
+	if c.config.OffCPUThreshold < support.OffCPUThresholdMax {
+		if err := trc.StartOffCPUProfiling(); err != nil {
+			return fmt.Errorf("failed to start off-cpu profiling: %v", err)
+		}
+		log.Printf("Enabled off-cpu profiling")
+	}
+
 	if c.config.ProbabilisticThreshold < tracer.ProbabilisticThresholdMax {
 		trc.StartProbabilisticProfiling(ctx)
 		log.Printf("Enabled probabilistic profiling")
diff --git a/libpf/libpf.go b/libpf/libpf.go
index ac6a9f3f..b9b6884d 100644
--- a/libpf/libpf.go
+++ b/libpf/libpf.go
@@ -60,3 +60,6 @@ type Void struct{}
 // source line numbers associated with offsets in native code, or for source line numbers in
 // interpreted code.
 type SourceLineno uint64
+
+// Origin identifies where a trace came from, e.g. on-CPU sampling or an off-CPU event.
+type Origin int
diff --git a/libpf/symbol.go b/libpf/symbol.go
index cf5c6ec3..81fd1ce5 100644
--- a/libpf/symbol.go
+++ b/libpf/symbol.go
@@ -6,6 +6,7 @@ package libpf // import "go.opentelemetry.io/ebpf-profiler/libpf"
 import (
 	"fmt"
 	"sort"
+	"strings"
 )
 
 // SymbolValue represents the value associated with a symbol, e.g. either an
@@ -81,6 +82,17 @@ func (symmap *SymbolMap) LookupSymbol(symbolName SymbolName) (*Symbol, error) {
 	return nil, fmt.Errorf("symbol %v not present in map", symbolName)
 }
 
+// LookupSymbolByPrefix returns a symbol whose name starts with prefix, or an error if none
+// does. Iteration follows map order, so the match chosen among several is unspecified.
+func (symmap *SymbolMap) LookupSymbolByPrefix(prefix string) (*Symbol, error) {
+	for name, sym := range symmap.nameToSymbol {
+		if strings.HasPrefix(string(name), prefix) {
+			return sym, nil
+		}
+	}
+	return nil, fmt.Errorf("no symbol present that starts with '%s'", prefix)
+}
+
 // LookupSymbolAddress returns the address of a symbol.
 // Returns SymbolValueInvalid and error if not found.
 func (symmap *SymbolMap) LookupSymbolAddress(symbolName SymbolName) (SymbolValue, error) {
diff --git a/reporter/base_reporter.go b/reporter/base_reporter.go
index 00c3deb6..fc02c9af 100644
--- a/reporter/base_reporter.go
+++ b/reporter/base_reporter.go
@@ -13,6 +13,7 @@ import (
 	"go.opentelemetry.io/ebpf-profiler/libpf/xsync"
 	"go.opentelemetry.io/ebpf-profiler/reporter/internal/pdata"
 	"go.opentelemetry.io/ebpf-profiler/reporter/internal/samples"
+	"go.opentelemetry.io/ebpf-profiler/support"
 )
 
 // baseReporter encapsulates shared behavior between all the available reporters.
@@ -35,7 +36,7 @@ type baseReporter struct {
 	cgroupv2ID *lru.SyncedLRU[libpf.PID, string]
 
 	// traceEvents stores reported trace events (trace metadata with frames and counts)
-	traceEvents xsync.RWMutex[map[samples.TraceAndMetaKey]*samples.TraceEvents]
+	traceEvents xsync.RWMutex[map[libpf.Origin]samples.KeyToEventMapping]
 
 	// hostmetadata stores metadata that is sent out with every request.
 	hostmetadata *lru.SyncedLRU[string, string]
@@ -97,8 +98,11 @@ func (*baseReporter) ReportMetrics(_ uint32, _ []uint32, _ []int64) {}
 func (*baseReporter) SupportsReportTraceEvent() bool { return true }
 
 func (b *baseReporter) ReportTraceEvent(trace *libpf.Trace, meta *TraceEventMeta) {
-	traceEventsMap := b.traceEvents.WLock()
-	defer b.traceEvents.WUnlock(&traceEventsMap)
+	if meta.Origin != support.TraceOriginSampling && meta.Origin != support.TraceOriginOffCPU {
+		// At the moment only on-CPU and off-CPU traces are reported.
+		log.Errorf("Skip reporting trace for unexpected %d origin", meta.Origin)
+		return
+	}
 
 	var extraMeta any
 	if b.cfg.ExtraSampleAttrProd != nil {
@@ -121,13 +125,17 @@ func (b *baseReporter) ReportTraceEvent(trace *libpf.Trace, meta *TraceEventMeta
 		ExtraMeta:      extraMeta,
 	}
 
-	if events, exists := (*traceEventsMap)[key]; exists {
+	traceEventsMap := b.traceEvents.WLock()
+	defer b.traceEvents.WUnlock(&traceEventsMap)
+
+	if events, exists := (*traceEventsMap)[meta.Origin][key]; exists {
 		events.Timestamps = append(events.Timestamps, uint64(meta.Timestamp))
-		(*traceEventsMap)[key] = events
+		events.OffTimes = append(events.OffTimes, meta.OffTime)
+		(*traceEventsMap)[meta.Origin][key] = events
 		return
 	}
 
-	(*traceEventsMap)[key] = &samples.TraceEvents{
+	(*traceEventsMap)[meta.Origin][key] = &samples.TraceEvents{
 		Files:              trace.Files,
 		Linenos:            trace.Linenos,
 		FrameTypes:         trace.FrameTypes,
@@ -135,6 +143,7 @@ func (b *baseReporter) ReportTraceEvent(trace *libpf.Trace, meta *TraceEventMeta
 		MappingEnds:        trace.MappingEnd,
 		MappingFileOffsets: trace.MappingFileOffsets,
 		Timestamps:         []uint64{uint64(meta.Timestamp)},
+		OffTimes:           []uint64{meta.OffTime},
 	}
 }
 
diff --git a/reporter/collector_reporter.go b/reporter/collector_reporter.go
index 24978f70..caf6dee1 100644
--- a/reporter/collector_reporter.go
+++ b/reporter/collector_reporter.go
@@ -16,6 +16,7 @@ import (
 	"go.opentelemetry.io/ebpf-profiler/libpf/xsync"
 	"go.opentelemetry.io/ebpf-profiler/reporter/internal/pdata"
 	"go.opentelemetry.io/ebpf-profiler/reporter/internal/samples"
+	"go.opentelemetry.io/ebpf-profiler/support"
 )
 
 // Assert that we implement the full Reporter interface.
@@ -59,16 +60,20 @@ func NewCollector(cfg *Config, nextConsumer consumerprofiles.Profiles) (*Collect
 		return nil, err
 	}
 
+	originsMap := make(map[libpf.Origin]samples.KeyToEventMapping, 2)
+	for _, origin := range []libpf.Origin{support.TraceOriginSampling,
+		support.TraceOriginOffCPU} {
+		originsMap[origin] = make(samples.KeyToEventMapping)
+	}
+
 	return &CollectorReporter{
 		baseReporter: &baseReporter{
-			cfg:        cfg,
-			name:       cfg.Name,
-			version:    cfg.Version,
-			pdata:      data,
-			cgroupv2ID: cgroupv2ID,
-			traceEvents: xsync.NewRWMutex(
-				map[samples.TraceAndMetaKey]*samples.TraceEvents{},
-			),
+			cfg:          cfg,
+			name:         cfg.Name,
+			version:      cfg.Version,
+			pdata:        data,
+			cgroupv2ID:   cgroupv2ID,
+			traceEvents:  xsync.NewRWMutex(originsMap),
 			hostmetadata: hostmetadata,
 			runLoop: &runLoop{
 				stopSignal: make(chan libpf.Void),
diff --git a/reporter/internal/pdata/generate.go b/reporter/internal/pdata/generate.go
index 3352f26c..18b98f5a 100644
--- a/reporter/internal/pdata/generate.go
+++ b/reporter/internal/pdata/generate.go
@@ -15,6 +15,7 @@ import (
 
 	"go.opentelemetry.io/ebpf-profiler/libpf"
 	"go.opentelemetry.io/ebpf-profiler/reporter/internal/samples"
+	"go.opentelemetry.io/ebpf-profiler/support"
 )
 
 const (
@@ -24,14 +25,16 @@ const (
 
 // Generate generates a pdata request out of internal profiles data, to be
 // exported.
-func (p Pdata) Generate(events map[samples.TraceAndMetaKey]*samples.TraceEvents) pprofile.Profiles {
+func (p Pdata) Generate(events map[libpf.Origin]samples.KeyToEventMapping) pprofile.Profiles {
 	profiles := pprofile.NewProfiles()
 	rp := profiles.ResourceProfiles().AppendEmpty()
 	sp := rp.ScopeProfiles().AppendEmpty()
-	prof := sp.Profiles().AppendEmpty()
-	prof.SetProfileID(pprofile.ProfileID(mkProfileID()))
-	p.setProfile(events, prof)
-
+	for _, origin := range []libpf.Origin{support.TraceOriginSampling,
+		support.TraceOriginOffCPU} {
+		prof := sp.Profiles().AppendEmpty()
+		prof.SetProfileID(pprofile.ProfileID(mkProfileID()))
+		p.setProfile(origin, events[origin], prof)
+	}
 	return profiles
 }
 
@@ -48,6 +51,7 @@ func mkProfileID() []byte {
 // setProfile sets the data an OTLP profile with all collected samples up to
 // this moment.
 func (p *Pdata) setProfile(
+	origin libpf.Origin,
 	events map[samples.TraceAndMetaKey]*samples.TraceEvents,
 	profile pprofile.Profile,
 ) {
@@ -62,13 +66,23 @@ func (p *Pdata) setProfile(
 	funcMap[samples.FuncInfo{Name: "", FileName: ""}] = 0
 
 	st := profile.SampleType().AppendEmpty()
-	st.SetTypeStrindex(getStringMapIndex(stringMap, "samples"))
-	st.SetUnitStrindex(getStringMapIndex(stringMap, "count"))
-
-	pt := profile.PeriodType()
-	pt.SetTypeStrindex(getStringMapIndex(stringMap, "cpu"))
-	pt.SetUnitStrindex(getStringMapIndex(stringMap, "nanoseconds"))
-	profile.SetPeriod(1e9 / int64(p.samplesPerSecond))
+	switch origin {
+	case support.TraceOriginSampling:
+		st.SetTypeStrindex(getStringMapIndex(stringMap, "samples"))
+		st.SetUnitStrindex(getStringMapIndex(stringMap, "count"))
+
+		pt := profile.PeriodType()
+		pt.SetTypeStrindex(getStringMapIndex(stringMap, "cpu"))
+		pt.SetUnitStrindex(getStringMapIndex(stringMap, "nanoseconds"))
+
+		profile.SetPeriod(1e9 / int64(p.samplesPerSecond))
+	case support.TraceOriginOffCPU:
+		st.SetTypeStrindex(getStringMapIndex(stringMap, "events"))
+		st.SetUnitStrindex(getStringMapIndex(stringMap, "nanoseconds"))
+	default:
+		log.Errorf("Generating profile for unsupported origin %d", origin)
+		return
+	}
 
 	// Temporary lookup to reference existing Mappings.
 	fileIDtoMapping := make(map[libpf.FileID]int32)
@@ -85,7 +99,15 @@ func (p *Pdata) setProfile(
 		endTS = pcommon.Timestamp(traceInfo.Timestamps[len(traceInfo.Timestamps)-1])
 
 		sample.TimestampsUnixNano().FromRaw(traceInfo.Timestamps)
-		sample.Value().Append(1)
+
+		switch origin {
+		case support.TraceOriginSampling:
+			sample.Value().Append(1)
+		case support.TraceOriginOffCPU:
+			for _, offTime := range traceInfo.OffTimes {
+				sample.Value().Append(int64(offTime))
+			}
+		}
 
 		// Walk every frame of the trace.
 		for i := range traceInfo.FrameTypes {
diff --git a/reporter/internal/samples/samples.go b/reporter/internal/samples/samples.go
index e938a6eb..233a9ae3 100644
--- a/reporter/internal/samples/samples.go
+++ b/reporter/internal/samples/samples.go
@@ -12,6 +12,8 @@ type TraceEventMeta struct {
 	APMServiceName string
 	PID, TID       libpf.PID
 	CPU            int
+	Origin         libpf.Origin
+	OffTime        uint64
 }
 
 // TraceEvents holds known information about a trace.
@@ -23,6 +25,7 @@ type TraceEvents struct {
 	MappingEnds        []libpf.Address
 	MappingFileOffsets []uint64
 	Timestamps         []uint64 // in nanoseconds
+	OffTimes           []uint64 // in nanoseconds
 }
 
 // TraceAndMetaKey is the deduplication key for samples. This **must always**
@@ -43,6 +46,9 @@ type TraceAndMetaKey struct {
 	ExtraMeta any
 }
 
+// KeyToEventMapping maps a sample deduplication key to the trace events recorded for it.
+type KeyToEventMapping map[TraceAndMetaKey]*TraceEvents
+
 // AttrKeyValue is a helper to populate Profile.attribute_table.
 type AttrKeyValue[T string | int64] struct {
 	Key string
diff --git a/reporter/otlp_reporter.go b/reporter/otlp_reporter.go
index 4525f642..e4cc5828 100644
--- a/reporter/otlp_reporter.go
+++ b/reporter/otlp_reporter.go
@@ -23,6 +23,7 @@ import (
 	"go.opentelemetry.io/ebpf-profiler/libpf/xsync"
 	"go.opentelemetry.io/ebpf-profiler/reporter/internal/pdata"
 	"go.opentelemetry.io/ebpf-profiler/reporter/internal/samples"
+	"go.opentelemetry.io/ebpf-profiler/support"
 )
 
 // Assert that we implement the full Reporter interface.
@@ -86,16 +87,20 @@ func NewOTLP(cfg *Config) (*OTLPReporter, error) {
 		return nil, err
 	}
 
+	originsMap := make(map[libpf.Origin]samples.KeyToEventMapping, 2)
+	for _, origin := range []libpf.Origin{support.TraceOriginSampling,
+		support.TraceOriginOffCPU} {
+		originsMap[origin] = make(samples.KeyToEventMapping)
+	}
+
 	return &OTLPReporter{
 		baseReporter: &baseReporter{
-			cfg:        cfg,
-			name:       cfg.Name,
-			version:    cfg.Version,
-			pdata:      data,
-			cgroupv2ID: cgroupv2ID,
-			traceEvents: xsync.NewRWMutex(
-				map[samples.TraceAndMetaKey]*samples.TraceEvents{},
-			),
+			cfg:          cfg,
+			name:         cfg.Name,
+			version:      cfg.Version,
+			pdata:        data,
+			cgroupv2ID:   cgroupv2ID,
+			traceEvents:  xsync.NewRWMutex(originsMap),
 			hostmetadata: hostmetadata,
 			runLoop: &runLoop{
 				stopSignal: make(chan libpf.Void),
@@ -165,7 +170,13 @@ func (r *OTLPReporter) Start(ctx context.Context) error {
 func (r *OTLPReporter) reportOTLPProfile(ctx context.Context) error {
 	traceEvents := r.traceEvents.WLock()
 	events := maps.Clone(*traceEvents)
+	originsMap := make(map[libpf.Origin]samples.KeyToEventMapping, 2)
 	clear(*traceEvents)
+	for _, origin := range []libpf.Origin{support.TraceOriginSampling,
+		support.TraceOriginOffCPU} {
+		originsMap[origin] = make(samples.KeyToEventMapping)
+	}
+	*traceEvents = originsMap
 	r.traceEvents.WUnlock(&traceEvents)
 
 	profiles := r.pdata.Generate(events)
diff --git a/support/ebpf/bpfdefs.h b/support/ebpf/bpfdefs.h
index 7171b3c2..1271f845 100644
--- a/support/ebpf/bpfdefs.h
+++ b/support/ebpf/bpfdefs.h
@@ -83,6 +83,8 @@ static int (*bpf_perf_event_output)(void *ctx, void *map, unsigned long long fla
     (void *)BPF_FUNC_perf_event_output;
 static int (*bpf_get_stackid)(void *ctx, void *map, u64 flags) =
     (void *)BPF_FUNC_get_stackid;
+static unsigned long long (*bpf_get_prandom_u32)(void) =
+    (void *)BPF_FUNC_get_prandom_u32;
 
 __attribute__ ((format (printf, 1, 3)))
 static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
diff --git a/support/ebpf/dotnet_tracer.ebpf.c b/support/ebpf/dotnet_tracer.ebpf.c
index 7ebac864..9b6ca040 100644
--- a/support/ebpf/dotnet_tracer.ebpf.c
+++ b/support/ebpf/dotnet_tracer.ebpf.c
@@ -244,7 +244,7 @@ ErrorCode unwind_one_dotnet_frame(PerCPURecord *record, DotnetProcInfo *vi, bool
 // unwind_dotnet is the entry point for tracing when invoked from the native tracer
 // or interpreter dispatcher. It does not reset the trace object and will append the
 // dotnet stack frames to the trace object for the current CPU.
-SEC("perf_event/unwind_dotnet")
+static inline __attribute__((__always_inline__))
 int unwind_dotnet(struct pt_regs *ctx) {
   PerCPURecord *record = get_per_cpu_record();
   if (!record) {
@@ -289,3 +289,4 @@ int unwind_dotnet(struct pt_regs *ctx) {
   DEBUG_PRINT("dotnet: tail call for next frame unwinder (%d) failed", unwinder);
   return -1;
 }
+MULTI_USE_FUNC(unwind_dotnet)
diff --git a/support/ebpf/extmaps.h b/support/ebpf/extmaps.h
index 5922d9ed..56c58a67 100644
--- a/support/ebpf/extmaps.h
+++ b/support/ebpf/extmaps.h
@@ -6,8 +6,9 @@
 #include "bpf_map.h"
 
 // References to map definitions in *.ebpf.c.
-extern bpf_map_def progs;
+extern bpf_map_def perf_progs;
 extern bpf_map_def per_cpu_records;
+extern bpf_map_def kernel_stackmap;
 extern bpf_map_def pid_page_to_mapping_info;
 extern bpf_map_def metrics;
 extern bpf_map_def report_events;
@@ -41,7 +42,6 @@ extern bpf_map_def exe_id_to_21_stack_deltas;
 extern bpf_map_def exe_id_to_22_stack_deltas;
 extern bpf_map_def exe_id_to_23_stack_deltas;
 extern bpf_map_def hotspot_procs;
-extern bpf_map_def kernel_stackmap;
 extern bpf_map_def dotnet_procs;
 extern bpf_map_def perl_procs;
 extern bpf_map_def php_procs;
diff --git a/support/ebpf/hotspot_tracer.ebpf.c b/support/ebpf/hotspot_tracer.ebpf.c
index 9ae1fdc7..54290e11 100644
--- a/support/ebpf/hotspot_tracer.ebpf.c
+++ b/support/ebpf/hotspot_tracer.ebpf.c
@@ -890,7 +890,7 @@ static ErrorCode hotspot_unwind_one_frame(PerCPURecord *record, HotspotProcInfo
 // unwind_hotspot is the entry point for tracing when invoked from the native tracer
 // and it recursive unwinds all HotSpot frames and then jumps back to unwind further
 // native frames that follow.
-SEC("perf_event/unwind_hotspot")
+static inline __attribute__((__always_inline__))
 int unwind_hotspot(struct pt_regs *ctx) {
   PerCPURecord *record = get_per_cpu_record();
   if (!record)
@@ -927,3 +927,4 @@ int unwind_hotspot(struct pt_regs *ctx) {
   DEBUG_PRINT("jvm: tail call for next frame unwinder (%d) failed", unwinder);
   return -1;
 }
+MULTI_USE_FUNC(unwind_hotspot)
diff --git a/support/ebpf/integration_test.ebpf.c b/support/ebpf/integration_test.ebpf.c
index 510e72c6..dd01a060 100644
--- a/support/ebpf/integration_test.ebpf.c
+++ b/support/ebpf/integration_test.ebpf.c
@@ -80,10 +80,10 @@ void send_sample_traces(void *ctx, u64 pid, s32 kstack) {
   send_trace(ctx, trace);
 }
 
-// tracepoint__sched_switch fetches the current kernel stack ID from kernel_stackmap and
-// communicates it to userspace via kernel_stack_id map.
-SEC("tracepoint/sched/sched_switch")
-int tracepoint__sched_switch(void *ctx) {
+// tracepoint_integration__sched_switch fetches the current kernel stack ID from
+// kernel_stackmap and communicates it to userspace via kernel_stack_id map.
+SEC("tracepoint/integration/sched_switch")
+int tracepoint_integration__sched_switch(void *ctx) {
   u64 id = bpf_get_current_pid_tgid();
   u64 pid = id >> 32;
 
diff --git a/support/ebpf/interpreter_dispatcher.ebpf.c b/support/ebpf/interpreter_dispatcher.ebpf.c
index fbc5c598..fcb4b329 100644
--- a/support/ebpf/interpreter_dispatcher.ebpf.c
+++ b/support/ebpf/interpreter_dispatcher.ebpf.c
@@ -25,8 +25,8 @@ bpf_map_def SEC("maps") metrics = {
   .max_entries = metricID_Max,
 };
 
-// progs maps from a program ID to an eBPF program
-bpf_map_def SEC("maps") progs = {
+// perf_progs maps from a program ID to a perf eBPF program
+bpf_map_def SEC("maps") perf_progs = {
   .type = BPF_MAP_TYPE_PROG_ARRAY,
   .key_size = sizeof(u32),
   .value_size = sizeof(u32),
@@ -172,7 +172,8 @@ void maybe_add_apm_info(Trace *trace) {
               trace->apm_transaction_id.as_int, corr_buf.trace_flags);
 }
 
-SEC("perf_event/unwind_stop")
+// unwind_stop is the tail call destination for PROG_UNWIND_STOP.
+static inline __attribute__((__always_inline__))
 int unwind_stop(struct pt_regs *ctx) {
   PerCPURecord *record = get_per_cpu_record();
   if (!record)
@@ -238,6 +239,7 @@ int unwind_stop(struct pt_regs *ctx) {
 
   return 0;
 }
+MULTI_USE_FUNC(unwind_stop)
 
 char _license[] SEC("license") = "GPL";
 // this number will be interpreted by the elf loader
diff --git a/support/ebpf/native_stack_trace.ebpf.c b/support/ebpf/native_stack_trace.ebpf.c
index 959099cb..c205440d 100644
--- a/support/ebpf/native_stack_trace.ebpf.c
+++ b/support/ebpf/native_stack_trace.ebpf.c
@@ -4,14 +4,6 @@
 #include "tracemgmt.h"
 #include "stackdeltatypes.h"
 
-#ifndef __USER32_CS
-  // defined in arch/x86/include/asm/segment.h
-  #define GDT_ENTRY_DEFAULT_USER32_CS  4
-  #define GDT_ENTRY_DEFAULT_USER_DS    5
-  #define __USER32_CS                 (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3)
-  #define __USER_DS                   (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
-#endif
-
 // Macro to create a map named exe_id_to_X_stack_deltas that is a nested maps with a fileID for the
 // outer map and an array as inner map that holds up to 2^X stack delta entries for the given fileID.
 #define STACK_DELTA_BUCKET(X)                                                            \
@@ -94,30 +86,6 @@ ErrorCode push_native(Trace *trace, u64 file, u64 line, bool return_address) {
   return _push_with_return_address(trace, file, line, FRAME_MARKER_NATIVE, return_address);
 }
 
-#ifdef __aarch64__
-// Strips the PAC tag from a pointer.
-//
-// While all pointers can contain PAC tags, we only apply this function to code pointers, because
-// that's where normalization is required to make the stack delta lookups work. Note that if that
-// should ever change, we'd need a different mask for the data pointers, because it might diverge
-// from the mask for code pointers.
-static inline u64 normalize_pac_ptr(u64 ptr) {
-  // Retrieve PAC mask from the system config.
-  u32 key = 0;
-  SystemConfig* syscfg = bpf_map_lookup_elem(&system_config, &key);
-  if (!syscfg) {
-    // Unreachable: array maps are always fully initialized.
-    return ptr;
-  }
-
-  // Mask off PAC bits. Since we're always applying this to usermode pointers that should have all
-  // the high bits set to 0, we don't need to consider the case of having to fill up the resulting
-  // hole with 1s (like we'd have to for kernel ptrs).
-  ptr &= syscfg->inverse_pac_mask;
-  return ptr;
-}
-#endif
-
 // A single step for the bsearch into the big_stack_deltas array. This is really a textbook bsearch
 // step, built in a way to update the value of *lo and *hi. This function will be called repeatedly
 // (since we cannot do loops). The return value signals whether the bsearch came to an end / found
@@ -607,157 +575,8 @@ static ErrorCode unwind_one_frame(u64 pid, u32 frame_idx, struct UnwindState *st
   #error unsupported architecture
 #endif
 
-// Initialize state from pt_regs
-static inline ErrorCode copy_state_regs(UnwindState *state,
-                                        struct pt_regs *regs,
-                                        bool interrupted_kernelmode)
-{
-#if defined(__x86_64__)
-  // Check if the process is running in 32-bit mode on the x86_64 system.
-  // This check follows the Linux kernel implementation of user_64bit_mode() in
-  // arch/x86/include/asm/ptrace.h.
-  if (regs->cs == __USER32_CS) {
-    return ERR_NATIVE_X64_32BIT_COMPAT_MODE;
-  }
-  state->pc = regs->ip;
-  state->sp = regs->sp;
-  state->fp = regs->bp;
-  state->rax = regs->ax;
-  state->r9 = regs->r9;
-  state->r11 = regs->r11;
-  state->r13 = regs->r13;
-  state->r15 = regs->r15;
-
-  // Treat syscalls as return addresses, but not IRQ handling, page faults, etc..
-  // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/x86/include/asm/syscall.h#L31-L39
-  // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/x86/entry/entry_64.S#L847
-  state->return_address = interrupted_kernelmode && regs->orig_ax != -1;
-#elif defined(__aarch64__)
-  // For backwards compatibility aarch64 can run 32-bit code.
-  // Check if the process is running in this 32-bit compat mod.
-  if (regs->pstate & PSR_MODE32_BIT) {
-    return ERR_NATIVE_AARCH64_32BIT_COMPAT_MODE;
-  }
-  state->pc = normalize_pac_ptr(regs->pc);
-  state->sp = regs->sp;
-  state->fp = regs->regs[29];
-  state->lr = normalize_pac_ptr(regs->regs[30]);
-  state->r22 = regs->regs[22];
-
-  // Treat syscalls as return addresses, but not IRQ handling, page faults, etc..
-  // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/arm64/include/asm/ptrace.h#L118
-  // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/arm64/include/asm/ptrace.h#L206-L209
-  //
-  // Note: We do not use `unwinder_mark_nonleaf_frame` here,
-  // because the frame is a leaf frame from the perspective of the user stack,
-  // regardless of whether we are in a syscall.
-  state->return_address = interrupted_kernelmode && regs->syscallno != -1;
-  state->lr_invalid = false;
-#endif
-
-  return ERR_OK;
-}
-
-#ifndef TESTING_COREDUMP
-
-// Read the task's entry stack pt_regs. This has identical functionality
-// to bpf_task_pt_regs which is emulated to support older kernels.
-// Once kernel requirement is increased to 5.15 this can be replaced with
-// the bpf_task_pt_regs() helper.
-static inline
-long get_task_pt_regs(struct task_struct *task, SystemConfig* syscfg) {
-  u64 stack_ptr = (u64)task + syscfg->task_stack_offset;
-  long stack_base;
-  if (bpf_probe_read_kernel(&stack_base, sizeof(stack_base), (void*) stack_ptr)) {
-    return 0;
-  }
-  return stack_base + syscfg->stack_ptregs_offset;
-}
-
-// Determine whether the given pt_regs are from user-mode register context.
-// This needs to detect also invalid pt_regs in case we its kernel thread stack
-// without valid user mode pt_regs so is_kernel_address(pc) is not enough.
-static inline
-bool ptregs_is_usermode(struct pt_regs *regs) {
-#if defined(__x86_64__)
-  // On x86_64 the user mode SS should always be __USER_DS.
-  if (regs->ss != __USER_DS) {
-    return false;
-  }
-  return true;
-#elif defined(__aarch64__)
-  // Check if the processor state is in the EL0t what linux uses for usermode.
-  if ((regs->pstate & PSR_MODE_MASK) != PSR_MODE_EL0t) {
-    return false;
-  }
-  return true;
-#else
-#error add support for new architecture
-#endif
-}
-
-// Extract the usermode pt_regs for current task. Use context given pt_regs
-// if it is usermode regs, or resolve it via struct task_struct.
-//
-// State registers are not touched (get_pristine_per_cpu_record already reset it)
-// if something fails. has_usermode_regs is set to true if a user-mode register
-// context was found: not every thread that we interrupt will actually have
-// a user-mode context (e.g. kernel worker threads won't).
-static inline ErrorCode get_usermode_regs(struct pt_regs *ctx,
-                                          UnwindState *state,
-                                          bool *has_usermode_regs) {
-  ErrorCode error;
-
-  if (!ptregs_is_usermode(ctx)) {
-    u32 key = 0;
-    SystemConfig* syscfg = bpf_map_lookup_elem(&system_config, &key);
-    if (!syscfg) {
-      // Unreachable: array maps are always fully initialized.
-      return ERR_UNREACHABLE;
-    }
-
-    // Use the current task's entry pt_regs
-    struct task_struct *task = (struct task_struct *) bpf_get_current_task();
-    long ptregs_addr = get_task_pt_regs(task, syscfg);
-
-    struct pt_regs regs;
-    if (!ptregs_addr || bpf_probe_read_kernel(&regs, sizeof(regs), (void*) ptregs_addr)) {
-      increment_metric(metricID_UnwindNativeErrReadKernelModeRegs);
-      return ERR_NATIVE_READ_KERNELMODE_REGS;
-    }
-
-    if (!ptregs_is_usermode(&regs)) {
-      // No usermode registers context found.
-      return ERR_OK;
-    }
-    error = copy_state_regs(state, &regs, true);
-  } else {
-    // User mode code interrupted, registers are available via the ebpf context.
-    error = copy_state_regs(state, ctx, false);
-  }
-  if (error == ERR_OK) {
-    DEBUG_PRINT("Read regs: pc: %llx sp: %llx fp: %llx", state->pc, state->sp, state->fp);
-    *has_usermode_regs = true;
-  }
-  return error;
-}
-
-#else // TESTING_COREDUMP
-
-static inline ErrorCode get_usermode_regs(struct pt_regs *ctx,
-                                          UnwindState *state,
-                                          bool *has_usermode_regs) {
-  // Coredumps provide always usermode pt_regs directly.
-  ErrorCode error = copy_state_regs(state, ctx, false);
-  if (error == ERR_OK) {
-    *has_usermode_regs = true;
-  }
-  return error;
-}
-
-#endif
-
-SEC("perf_event/unwind_native")
+// unwind_native is the tail call destination for PROG_UNWIND_NATIVE.
+static inline __attribute__((__always_inline__))
 int unwind_native(struct pt_regs *ctx) {
   PerCPURecord *record = get_per_cpu_record();
   if (!record)
@@ -809,8 +628,8 @@ int unwind_native(struct pt_regs *ctx) {
   return -1;
 }
 
-static inline
-int collect_trace(struct pt_regs *ctx) {
+SEC("perf_event/native_tracer_entry")
+int native_tracer_entry(struct bpf_perf_event_data *ctx) {
   // Get the PID and TGID register.
   u64 id = bpf_get_current_pid_tgid();
   u32 pid = id >> 32;
@@ -820,54 +639,7 @@ int collect_trace(struct pt_regs *ctx) {
     return 0;
   }
 
-  u64 ktime = bpf_ktime_get_ns();
-
-  DEBUG_PRINT("==== do_perf_event ====");
-
-  // The trace is reused on each call to this function so we have to reset the
-  // variables used to maintain state.
-  DEBUG_PRINT("Resetting CPU record");
-  PerCPURecord *record = get_pristine_per_cpu_record();
-  if (!record) {
-    return -1;
-  }
-
-  Trace *trace = &record->trace;
-  trace->pid = pid;
-  trace->tid = tid;
-  trace->ktime = ktime;
-  if (bpf_get_current_comm(&(trace->comm), sizeof(trace->comm)) < 0) {
-    increment_metric(metricID_ErrBPFCurrentComm);
-  }
-
-  // Get the kernel mode stack trace first
-  trace->kernel_stack_id = bpf_get_stackid(ctx, &kernel_stackmap, BPF_F_REUSE_STACKID);
-  DEBUG_PRINT("kernel stack id = %d", trace->kernel_stack_id);
-
-  // Recursive unwind frames
-  int unwinder = PROG_UNWIND_STOP;
-  bool has_usermode_regs = false;
-  ErrorCode error = get_usermode_regs(ctx, &record->state, &has_usermode_regs);
-  if (error || !has_usermode_regs) {
-    goto exit;
-  }
-
-  if (!pid_information_exists(ctx, pid)) {
-    if (report_pid(ctx, pid, RATELIMIT_ACTION_DEFAULT)) {
-      increment_metric(metricID_NumProcNew);
-    }
-    return 0;
-  }
-  error = get_next_unwinder_after_native_frame(record, &unwinder);
-
-exit:
-  record->state.unwind_error = error;
-  tail_call(ctx, unwinder);
-  DEBUG_PRINT("bpf_tail call failed for %d in native_tracer_entry", unwinder);
-  return -1;
-}
-
-SEC("perf_event/native_tracer_entry")
-int native_tracer_entry(struct bpf_perf_event_data *ctx) {
-  return collect_trace((struct pt_regs*) &ctx->regs);
+  u64 ts = bpf_ktime_get_ns();
+  return collect_trace((struct pt_regs*) &ctx->regs, TRACE_SAMPLING, pid, tid, ts, 0);
 }
+MULTI_USE_FUNC(unwind_native)
diff --git a/support/ebpf/off_cpu.ebpf.c b/support/ebpf/off_cpu.ebpf.c
new file mode 100644
index 00000000..732df70a
--- /dev/null
+++ b/support/ebpf/off_cpu.ebpf.c
@@ -0,0 +1,87 @@
+#include "bpfdefs.h"
+#include "tracemgmt.h"
+#include "types.h"
+
+// kprobe_progs maps a program ID (u32 key) to a kprobe eBPF program for use with bpf_tail_call.
+bpf_map_def SEC("maps") kprobe_progs = {
+  .type = BPF_MAP_TYPE_PROG_ARRAY,
+  .key_size = sizeof(u32),
+  .value_size = sizeof(u32),
+  .max_entries = NUM_TRACER_PROGS,
+};
+
+// sched_times keeps the time recorded by tracepoint__sched_switch per task (pid_tgid).
+bpf_map_def SEC("maps") sched_times = {
+  .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, // NOTE(review): per-CPU; check task migration
+  .key_size = sizeof(u64),   // pid_tgid
+  .value_size = sizeof(u64), // time in ns
+  .max_entries = 256,
+};
+
+// tracepoint__sched_switch is the entry point for off-cpu profiling. For a random sample of
+// context switches it stores the current time in sched_times, keyed by the current pid_tgid.
+SEC("tracepoint/sched/sched_switch")
+int tracepoint__sched_switch(void *ctx) {
+  u64 pid_tgid = bpf_get_current_pid_tgid();
+  u32 pid = pid_tgid >> 32;
+  u32 tid = pid_tgid & 0xFFFFFFFF;
+
+  if (pid == 0 || tid == 0) {
+    return 0;
+  }
+
+  u32 key = 0;
+  SystemConfig* syscfg = bpf_map_lookup_elem(&system_config, &key);
+  if (!syscfg) {
+    // Unreachable: array maps are always fully initialized.
+    return ERR_UNREACHABLE;
+  }
+
+  // Sample only if the threshold is greater than a random number in [0, OFF_CPU_THRESHOLD_MAX).
+  if (bpf_get_prandom_u32() % OFF_CPU_THRESHOLD_MAX >= syscfg->off_cpu_threshold) {
+    return 0;
+  }
+
+  u64 ts = bpf_ktime_get_ns();
+  if (bpf_map_update_elem(&sched_times, &pid_tgid, &ts, BPF_ANY) < 0) {
+    DEBUG_PRINT("Failed to record sched_switch event entry");
+    return 0;
+  }
+
+  return 0;
+}
+
+// dummy is never loaded or called. It only makes sure kprobe_progs is
+// referenced, which keeps the compiler and linker happy.
+SEC("kprobe/dummy")
+int dummy(struct pt_regs *ctx) {
+  bpf_tail_call(ctx, &kprobe_progs, 0);
+  return 0;
+}
+
+// finish_task_switch is triggered right after the scheduler updated the CPU registers. It
+// reports an off-cpu trace if tracepoint__sched_switch recorded a timestamp for this task.
+SEC("kprobe/finish_task_switch")
+int finish_task_switch(struct pt_regs *ctx) {
+  // The upper 32 bits of pid_tgid are used as PID, the lower 32 bits as TID.
+  u64 pid_tgid = bpf_get_current_pid_tgid();
+  u32 pid = pid_tgid >> 32;
+  u32 tid = pid_tgid & 0xFFFFFFFF;
+
+  if (pid == 0 || tid == 0) {
+    return 0;
+  }
+
+  u64 ts = bpf_ktime_get_ns();
+  u64 *start_ts = bpf_map_lookup_elem(&sched_times, &pid_tgid);
+  if (!start_ts || *start_ts == 0) {
+    // There is no information from the sched/sched_switch entry hook.
+    return 0;
+  }
+
+  u64 diff = ts - *start_ts;
+  // Consume the timestamp so that it is not reused for a later, unsampled task switch.
+  *start_ts = 0;
+  DEBUG_PRINT("==== finish_task_switch ====");
+  return collect_trace(ctx, TRACE_OFF_CPU, pid, tid, ts, diff);
+}
diff --git a/support/ebpf/perl_tracer.ebpf.c b/support/ebpf/perl_tracer.ebpf.c
index c498341b..1e143468 100644
--- a/support/ebpf/perl_tracer.ebpf.c
+++ b/support/ebpf/perl_tracer.ebpf.c
@@ -356,7 +356,7 @@ int walk_perl_stack(PerCPURecord *record, const PerlProcInfo *perlinfo) {
 // unwind_perl is the entry point for tracing when invoked from the native tracer
 // or interpreter dispatcher. It does not reset the trace object and will append the
 // Perl stack frames to the trace object for the current CPU.
-SEC("perf_event/unwind_perl")
+static inline __attribute__((__always_inline__))
 int unwind_perl(struct pt_regs *ctx) {
   PerCPURecord *record = get_per_cpu_record();
   if (!record) {
@@ -426,3 +426,4 @@ int unwind_perl(struct pt_regs *ctx) {
   tail_call(ctx, unwinder);
   return -1;
 }
+MULTI_USE_FUNC(unwind_perl)
diff --git a/support/ebpf/php_tracer.ebpf.c b/support/ebpf/php_tracer.ebpf.c
index 677cd185..506f63e3 100644
--- a/support/ebpf/php_tracer.ebpf.c
+++ b/support/ebpf/php_tracer.ebpf.c
@@ -182,7 +182,8 @@ int walk_php_stack(PerCPURecord *record, PHPProcInfo *phpinfo, bool is_jitted) {
   return unwinder;
 }
 
-SEC("perf_event/unwind_php")
+// unwind_php is the tail call destination for PROG_UNWIND_PHP.
+static inline __attribute__((__always_inline__))
 int unwind_php(struct pt_regs *ctx) {
   PerCPURecord *record = get_per_cpu_record();
   if (!record)
@@ -239,3 +240,4 @@ int unwind_php(struct pt_regs *ctx) {
   tail_call(ctx, unwinder);
   return -1;
 }
+MULTI_USE_FUNC(unwind_php)
diff --git a/support/ebpf/python_tracer.ebpf.c b/support/ebpf/python_tracer.ebpf.c
index d99147be..9d03375f 100644
--- a/support/ebpf/python_tracer.ebpf.c
+++ b/support/ebpf/python_tracer.ebpf.c
@@ -276,7 +276,7 @@ ErrorCode get_PyFrame(const PyProcInfo *pyinfo, void **frame) {
 // unwind_python is the entry point for tracing when invoked from the native tracer
 // or interpreter dispatcher. It does not reset the trace object and will append the
 // Python stack frames to the trace object for the current CPU.
-SEC("perf_event/unwind_python")
+static inline __attribute__((__always_inline__))
 int unwind_python(struct pt_regs *ctx) {
   PerCPURecord *record = get_per_cpu_record();
   if (!record)
@@ -318,3 +318,4 @@ int unwind_python(struct pt_regs *ctx) {
   tail_call(ctx, unwinder);
   return -1;
 }
+MULTI_USE_FUNC(unwind_python)
diff --git a/support/ebpf/ruby_tracer.ebpf.c b/support/ebpf/ruby_tracer.ebpf.c
index 41ecacaa..57fa1101 100644
--- a/support/ebpf/ruby_tracer.ebpf.c
+++ b/support/ebpf/ruby_tracer.ebpf.c
@@ -216,7 +216,8 @@ ErrorCode walk_ruby_stack(PerCPURecord *record, const RubyProcInfo *rubyinfo,
   return ERR_OK;
 }
 
-SEC("perf_event/unwind_ruby")
+// unwind_ruby is the tail call destination for PROG_UNWIND_RUBY.
+static inline __attribute__((__always_inline__))
 int unwind_ruby(struct pt_regs *ctx) {
   PerCPURecord *record = get_per_cpu_record();
   if (!record)
@@ -273,3 +274,4 @@ int unwind_ruby(struct pt_regs *ctx) {
   tail_call(ctx, unwinder);
   return -1;
 }
+MULTI_USE_FUNC(unwind_ruby)
diff --git a/support/ebpf/tracemgmt.h b/support/ebpf/tracemgmt.h
index c0e08a45..ab347190 100644
--- a/support/ebpf/tracemgmt.h
+++ b/support/ebpf/tracemgmt.h
@@ -10,6 +10,19 @@
 #include "types.h"
 #include "errors.h"
 
+// MULTI_USE_FUNC wraps func_name in two eBPF programs: perf_<func_name> in section
+// perf_event/<func_name> and kprobe_<func_name> in section kprobe/<func_name>.
+#define MULTI_USE_FUNC(func_name) \
+    SEC("perf_event/"#func_name) \
+    int perf_##func_name(struct pt_regs *ctx) { \
+        return func_name(ctx); \
+    } \
+    \
+    SEC("kprobe/"#func_name) \
+    int kprobe_##func_name(struct pt_regs *ctx) { \
+        return func_name(ctx); \
+    }
+
 // increment_metric increments the value of the given metricID by 1
 static inline __attribute__((__always_inline__))
 void increment_metric(u32 metricID) {
@@ -443,10 +456,10 @@ int get_next_unwinder_after_interpreter(const PerCPURecord *record) {
 // tail_call is a wrapper around bpf_tail_call() and ensures that the number of tail calls is not
 // reached while unwinding the stack.
 static inline __attribute__((__always_inline__))
-void tail_call(void *ctx, int next) {
+void tail_call(void *ctx, int next) { 
   PerCPURecord *record = get_per_cpu_record();
   if (!record) {
-    bpf_tail_call(ctx, &progs, PROG_UNWIND_STOP);
+    bpf_tail_call(ctx, &perf_progs, PROG_UNWIND_STOP);
     // In theory bpf_tail_call() should never return. But due to instruction reordering by the
     // compiler we have to place return here to bribe the verifier to accept this.
     return;
@@ -464,7 +477,237 @@ void tail_call(void *ctx, int next) {
   }
   record->tailCalls += 1 ;
 
-  bpf_tail_call(ctx, &progs, next);
+  bpf_tail_call(ctx, &perf_progs, next);
+}
+
+#ifndef __USER32_CS
+  // defined in arch/x86/include/asm/segment.h
+  #define GDT_ENTRY_DEFAULT_USER32_CS  4
+  #define GDT_ENTRY_DEFAULT_USER_DS    5
+  #define __USER32_CS                 (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3)
+  #define __USER_DS                   (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
+#endif
+
+#ifdef __aarch64__
+// Strips the PAC tag from a pointer; returns ptr unchanged if the system config is unavailable.
+//
+// While all pointers can contain PAC tags, we only apply this function to code pointers, because
+// that's where normalization is required to make the stack delta lookups work. Note that if that
+// should ever change, we'd need a different mask for the data pointers, because it might diverge
+// from the mask for code pointers.
+static inline u64 normalize_pac_ptr(u64 ptr) {
+  // Retrieve PAC mask from the system config.
+  u32 key = 0;
+  SystemConfig* syscfg = bpf_map_lookup_elem(&system_config, &key);
+  if (!syscfg) {
+    // Unreachable: array maps are always fully initialized.
+    return ptr;
+  }
+
+  // Mask off PAC bits. Since we're always applying this to usermode pointers that should have all
+  // the high bits set to 0, we don't need to consider the case of having to fill up the resulting
+  // hole with 1s (like we'd have to for kernel ptrs).
+  ptr &= syscfg->inverse_pac_mask;
+  return ptr;
+}
+#endif
+
+// copy_state_regs initializes state from regs; it fails for tasks in 32-bit compat mode.
+static inline ErrorCode copy_state_regs(UnwindState *state,
+                                        struct pt_regs *regs,
+                                        bool interrupted_kernelmode)
+{
+#if defined(__x86_64__)
+  // Check if the process is running in 32-bit mode on the x86_64 system.
+  // This check follows the Linux kernel implementation of user_64bit_mode() in
+  // arch/x86/include/asm/ptrace.h.
+  if (regs->cs == __USER32_CS) {
+    return ERR_NATIVE_X64_32BIT_COMPAT_MODE;
+  }
+  state->pc = regs->ip;
+  state->sp = regs->sp;
+  state->fp = regs->bp;
+  state->rax = regs->ax;
+  state->r9 = regs->r9;
+  state->r11 = regs->r11;
+  state->r13 = regs->r13;
+  state->r15 = regs->r15;
+
+  // Treat syscalls as return addresses, but not IRQ handling, page faults, etc.
+  // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/x86/include/asm/syscall.h#L31-L39
+  // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/x86/entry/entry_64.S#L847
+  state->return_address = interrupted_kernelmode && regs->orig_ax != -1;
+#elif defined(__aarch64__)
+  // For backwards compatibility aarch64 can run 32-bit code.
+  // Check if the process is running in this 32-bit compat mode.
+  if (regs->pstate & PSR_MODE32_BIT) {
+    return ERR_NATIVE_AARCH64_32BIT_COMPAT_MODE;
+  }
+  state->pc = normalize_pac_ptr(regs->pc);
+  state->sp = regs->sp;
+  state->fp = regs->regs[29];
+  state->lr = normalize_pac_ptr(regs->regs[30]);
+  state->r22 = regs->regs[22];
+
+  // Treat syscalls as return addresses, but not IRQ handling, page faults, etc.
+  // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/arm64/include/asm/ptrace.h#L118
+  // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/arm64/include/asm/ptrace.h#L206-L209
+  //
+  // Note: We do not use `unwinder_mark_nonleaf_frame` here,
+  // because the frame is a leaf frame from the perspective of the user stack,
+  // regardless of whether we are in a syscall.
+  state->return_address = interrupted_kernelmode && regs->syscallno != -1;
+  state->lr_invalid = false;
+#endif
+
+  return ERR_OK;
+}
+
+#ifndef TESTING_COREDUMP
+
+// Return the address of the task's entry stack pt_regs, or 0 if the stack base cannot be read.
+// This has identical functionality to bpf_task_pt_regs which is emulated to support older
+// kernels. Once kernel requirement is increased to 5.15 this can be replaced with
+// the bpf_task_pt_regs() helper.
+static inline
+long get_task_pt_regs(struct task_struct *task, SystemConfig* syscfg) {
+  u64 stack_ptr = (u64)task + syscfg->task_stack_offset;
+  long stack_base;
+  if (bpf_probe_read_kernel(&stack_base, sizeof(stack_base), (void*) stack_ptr)) {
+    return 0;
+  }
+  return stack_base + syscfg->stack_ptregs_offset;
+}
+
+// Determine whether the given pt_regs are from user-mode register context.
+// This also needs to detect invalid pt_regs, e.g. on a kernel thread stack
+// without valid user mode pt_regs, so is_kernel_address(pc) is not enough.
+static inline
+bool ptregs_is_usermode(struct pt_regs *regs) {
+#if defined(__x86_64__)
+  // On x86_64 the user mode SS should always be __USER_DS.
+  if (regs->ss != __USER_DS) {
+    return false;
+  }
+  return true;
+#elif defined(__aarch64__)
+  // Check if the processor state is EL0t, which Linux uses for usermode.
+  if ((regs->pstate & PSR_MODE_MASK) != PSR_MODE_EL0t) {
+    return false;
+  }
+  return true;
+#else
+#error add support for new architecture
+#endif
+}
+
+// Extract the usermode pt_regs for the current task. Use the pt_regs given via ctx
+// if they are usermode regs, or resolve them via struct task_struct.
+//
+// State registers are not touched (get_pristine_per_cpu_record already reset them)
+// if something fails. has_usermode_regs is set to true if a user-mode register
+// context was found: not every thread that we interrupt will actually have
+// a user-mode context (e.g. kernel worker threads won't).
+static inline ErrorCode get_usermode_regs(struct pt_regs *ctx,
+                                          UnwindState *state,
+                                          bool *has_usermode_regs) {
+  ErrorCode error;
+
+  if (!ptregs_is_usermode(ctx)) {
+    u32 key = 0;
+    SystemConfig* syscfg = bpf_map_lookup_elem(&system_config, &key);
+    if (!syscfg) {
+      // Unreachable: array maps are always fully initialized.
+      return ERR_UNREACHABLE;
+    }
+
+    // Use the current task's entry pt_regs
+    struct task_struct *task = (struct task_struct *) bpf_get_current_task();
+    long ptregs_addr = get_task_pt_regs(task, syscfg);
+
+    struct pt_regs regs;
+    if (!ptregs_addr || bpf_probe_read_kernel(&regs, sizeof(regs), (void*) ptregs_addr)) {
+      increment_metric(metricID_UnwindNativeErrReadKernelModeRegs);
+      return ERR_NATIVE_READ_KERNELMODE_REGS;
+    }
+
+    if (!ptregs_is_usermode(&regs)) {
+      // No usermode register context found; has_usermode_regs is left untouched.
+      return ERR_OK;
+    }
+    error = copy_state_regs(state, &regs, true);
+  } else {
+    // User mode code interrupted, registers are available via the ebpf context.
+    error = copy_state_regs(state, ctx, false);
+  }
+  if (error == ERR_OK) {
+    DEBUG_PRINT("Read regs: pc: %llx sp: %llx fp: %llx", state->pc, state->sp, state->fp);
+    *has_usermode_regs = true;
+  }
+  return error;
+}
+
+#else // TESTING_COREDUMP
+
+static inline ErrorCode get_usermode_regs(struct pt_regs *ctx,
+                                          UnwindState *state,
+                                          bool *has_usermode_regs) {
+  // Coredumps always provide usermode pt_regs directly.
+  ErrorCode error = copy_state_regs(state, ctx, false);
+  if (error == ERR_OK) {
+    *has_usermode_regs = true;
+  }
+  return error;
+}
+
+#endif // TESTING_COREDUMP
+
+// collect_trace initializes the per-CPU trace record and tail calls into the first unwinder.
+static inline
+int collect_trace(struct pt_regs *ctx, TraceOrigin origin, u32 pid, u32 tid,
+  u64 trace_timestamp, u64 off_cpu_time) {
+  // The per-CPU record is reused on every call, so reset the state it maintains first.
+  DEBUG_PRINT("Resetting CPU record");
+  PerCPURecord *record = get_pristine_per_cpu_record();
+  if (!record) {
+    return -1;
+  }
+
+  Trace *trace = &record->trace;
+  trace->origin = origin;
+  trace->pid = pid;
+  trace->tid = tid;
+  trace->ktime = trace_timestamp;
+  trace->offtime = off_cpu_time;
+  if (bpf_get_current_comm(&(trace->comm), sizeof(trace->comm)) < 0) {
+    increment_metric(metricID_ErrBPFCurrentComm);
+  }
+
+  // Get the kernel mode stack trace first
+  trace->kernel_stack_id = bpf_get_stackid(ctx, &kernel_stackmap, BPF_F_REUSE_STACKID);
+  DEBUG_PRINT("kernel stack id = %d", trace->kernel_stack_id);
+
+  // Recursive unwind frames
+  int unwinder = PROG_UNWIND_STOP;
+  bool has_usermode_regs = false;
+  ErrorCode error = get_usermode_regs(ctx, &record->state, &has_usermode_regs);
+  if (error || !has_usermode_regs) {
+    goto exit;
+  }
+
+  if (!pid_information_exists(ctx, pid)) {
+    if (report_pid(ctx, pid, RATELIMIT_ACTION_DEFAULT)) {
+      increment_metric(metricID_NumProcNew);
+    }
+    return 0;
+  }
+  error = get_next_unwinder_after_native_frame(record, &unwinder);
+
+exit:
+  record->state.unwind_error = error;
+  tail_call(ctx, unwinder);
+  DEBUG_PRINT("bpf_tail call failed for %d in collect_trace", unwinder);
+  return -1;
 }
 
 #endif
diff --git a/support/ebpf/tracer.ebpf.release.amd64 b/support/ebpf/tracer.ebpf.release.amd64
index 2a8cbb2f..62e2f66a 100644
Binary files a/support/ebpf/tracer.ebpf.release.amd64 and b/support/ebpf/tracer.ebpf.release.amd64 differ
diff --git a/support/ebpf/tracer.ebpf.release.arm64 b/support/ebpf/tracer.ebpf.release.arm64
index ecac9895..2a226a6c 100644
Binary files a/support/ebpf/tracer.ebpf.release.arm64 and b/support/ebpf/tracer.ebpf.release.arm64 differ
diff --git a/support/ebpf/types.h b/support/ebpf/types.h
index e5592ff8..da3a9efd 100644
--- a/support/ebpf/types.h
+++ b/support/ebpf/types.h
@@ -331,6 +331,17 @@ typedef enum TracePrograms {
   NUM_TRACER_PROGS,
 } TracePrograms;
 
+// TraceOrigin describes the source of the trace. This enables
+// origin specific handling of traces in user space.
+typedef enum TraceOrigin {
+  TRACE_UNKNOWN,
+  TRACE_SAMPLING, // passed to collect_trace by native_tracer_entry
+  TRACE_OFF_CPU,  // passed to collect_trace by finish_task_switch
+} TraceOrigin;
+
+// OFF_CPU_THRESHOLD_MAX bounds the random number that is compared with off_cpu_threshold.
+#define OFF_CPU_THRESHOLD_MAX 1000
+
 // MAX_FRAME_UNWINDS defines the maximum number of frames per
 // Trace we can unwind and respect the limit of eBPF instructions,
 // limit of tail calls and limit of stack size per eBPF program.
@@ -532,6 +543,13 @@ typedef struct Trace {
   s32 kernel_stack_id;
   // The number of frames in the stack.
   u32 stack_len;
+
+  // origin indicates the source of the trace.
+  TraceOrigin origin;
+
+  // offtime stores the nanoseconds that the trace was off-cpu for.
+  u64 offtime;
+
   // The frames of the stack trace.
   Frame frames[MAX_FRAME_UNWINDS];
 
@@ -851,6 +869,9 @@ typedef struct SystemConfig {
   // The offset of struct pt_regs within the kernel entry stack.
   u32 stack_ptregs_offset;
 
+  // User defined threshold for off-cpu profiling.
+  u32 off_cpu_threshold;
+
   // Enables the temporary hack that drops pure errors frames in unwind_stop.
   bool drop_error_only_traces;
 } SystemConfig;
diff --git a/support/ebpf/v8_tracer.ebpf.c b/support/ebpf/v8_tracer.ebpf.c
index d6da2a71..1faa1834 100644
--- a/support/ebpf/v8_tracer.ebpf.c
+++ b/support/ebpf/v8_tracer.ebpf.c
@@ -284,7 +284,7 @@ ErrorCode unwind_one_v8_frame(PerCPURecord *record, V8ProcInfo *vi, bool top) {
 // unwind_v8 is the entry point for tracing when invoked from the native tracer
 // or interpreter dispatcher. It does not reset the trace object and will append the
 // V8 stack frames to the trace object for the current CPU.
-SEC("perf_event/unwind_v8")
+static inline __attribute__((__always_inline__))
 int unwind_v8(struct pt_regs *ctx) {
   PerCPURecord *record = get_per_cpu_record();
   if (!record) {
@@ -328,3 +328,4 @@ int unwind_v8(struct pt_regs *ctx) {
   DEBUG_PRINT("v8: tail call for next frame unwinder (%d) failed", unwinder);
   return -1;
 }
+MULTI_USE_FUNC(unwind_v8)
diff --git a/support/types.go b/support/types.go
index 6387e4d0..a37f38aa 100644
--- a/support/types.go
+++ b/support/types.go
@@ -105,3 +105,11 @@ const (
 	// PerfMaxStackDepth is the bpf map data array length for BPF_MAP_TYPE_STACK_TRACE traces
 	PerfMaxStackDepth = C.PERF_MAX_STACK_DEPTH
 )
+
+const (
+	TraceOriginUnknown  = C.TRACE_UNKNOWN  // origin of the trace is not known
+	TraceOriginSampling = C.TRACE_SAMPLING // trace collected by perf event sampling
+	TraceOriginOffCPU   = C.TRACE_OFF_CPU  // trace collected by the off-cpu hook
+)
+
+const OffCPUThresholdMax = C.OFF_CPU_THRESHOLD_MAX // mirrors OFF_CPU_THRESHOLD_MAX in types.h
diff --git a/tracehandler/tracehandler.go b/tracehandler/tracehandler.go
index bb0d657c..d3f9c411 100644
--- a/tracehandler/tracehandler.go
+++ b/tracehandler/tracehandler.go
@@ -128,6 +128,8 @@ func (m *traceHandler) HandleTrace(bpfTrace *host.Trace) {
 		APMServiceName: "", // filled in below
 		CPU:            bpfTrace.CPU,
 		Executable:     bpfTrace.Executable,
+		Origin:         bpfTrace.Origin,
+		OffTime:        bpfTrace.OffTime,
 	}
 
 	if !m.reporter.SupportsReportTraceEvent() {
diff --git a/tracer/ebpf_integration_test.go b/tracer/ebpf_integration_test.go
index 19e3c648..6b784466 100644
--- a/tracer/ebpf_integration_test.go
+++ b/tracer/ebpf_integration_test.go
@@ -56,7 +56,7 @@ func runKernelFrameProbe(t *testing.T, tracer *Tracer) {
 	require.NoError(t, err)
 	defer restoreRlimit()
 
-	prog, err := cebpf.NewProgram(coll.Programs["tracepoint__sched_switch"])
+	prog, err := cebpf.NewProgram(coll.Programs["tracepoint_integration__sched_switch"])
 	require.NoError(t, err)
 	defer prog.Close()
 
@@ -255,7 +255,13 @@ func TestAllTracers(t *testing.T) {
 	kernelSymbols, err := proc.GetKallsyms("/proc/kallsyms")
 	require.NoError(t, err)
 
-	_, _, err = initializeMapsAndPrograms(tracertypes.AllTracers(), kernelSymbols,
-		false, 1, false, false, 0)
+	_, _, err = initializeMapsAndPrograms(kernelSymbols, &Config{
+		IncludeTracers:      tracertypes.AllTracers(),
+		MapScaleFactor:      1,
+		FilterErrorFrames:   false,
+		KernelVersionCheck:  false,
+		DebugTracer:         false,
+		BPFVerifierLogLevel: 0,
+	})
 	require.NoError(t, err)
 }
diff --git a/tracer/systemconfig.go b/tracer/systemconfig.go
index 0e3a3daf..b8dc3ad4 100644
--- a/tracer/systemconfig.go
+++ b/tracer/systemconfig.go
@@ -227,7 +227,7 @@ func determineStackLayout(coll *cebpf.CollectionSpec, maps map[string]*cebpf.Map
 
 func loadSystemConfig(coll *cebpf.CollectionSpec, maps map[string]*cebpf.Map,
 	kernelSymbols *libpf.SymbolMap, includeTracers types.IncludedTracers,
-	filterErrorFrames bool) error {
+	offCPUThreshold uint32, filterErrorFrames bool) error {
 	pacMask := pacmask.GetPACMask()
 	if pacMask != 0 {
 		log.Infof("Determined PAC mask to be 0x%016X", pacMask)
@@ -237,6 +237,7 @@ func loadSystemConfig(coll *cebpf.CollectionSpec, maps map[string]*cebpf.Map,
 	syscfg := C.SystemConfig{
 		inverse_pac_mask:       ^C.u64(pacMask),
 		drop_error_only_traces: C.bool(filterErrorFrames),
+		off_cpu_threshold:      C.u32(offCPUThreshold),
 	}
 
 	if err := parseBTF(&syscfg); err != nil {
diff --git a/tracer/tracer.go b/tracer/tracer.go
index 64caeafd..4cbbf203 100644
--- a/tracer/tracer.go
+++ b/tracer/tracer.go
@@ -19,6 +19,7 @@ import (
 	"unsafe"
 
 	cebpf "github.com/cilium/ebpf"
+	"github.com/cilium/ebpf/asm"
 	"github.com/cilium/ebpf/link"
 	"github.com/elastic/go-perf"
 	log "github.com/sirupsen/logrus"
@@ -153,6 +154,8 @@ type Config struct {
 	ProbabilisticInterval time.Duration
 	// ProbabilisticThreshold is the threshold for probabilistic profiling.
 	ProbabilisticThreshold uint
+	// OffCPUThreshold is the user defined threshold for off-cpu profiling.
+	OffCPUThreshold uint32
 }
 
 // hookPoint specifies the group and name of the hooked point in the kernel.
@@ -160,6 +163,18 @@ type hookPoint struct {
 	group, name string
 }
 
+// progLoaderHelper describes a single eBPF program for the loading process.
+type progLoaderHelper struct {
+	// enable tells whether a prog shall be loaded.
+	enable bool
+	// name is the name of the eBPF program.
+	name string
+	// progID defines the ID for the eBPF program that is used as key in the tailcallMap.
+	progID uint32
+	// noTailCallTarget is true for eBPF programs that should NOT be added to the tailcallMap.
+	noTailCallTarget bool
+}
+
 // processKernelModulesMetadata computes the FileID of kernel files and reports executable metadata
 // for all kernel modules and the vmlinux image.
 func processKernelModulesMetadata(rep reporter.SymbolReporter, kernelModules *libpf.SymbolMap,
@@ -267,9 +282,7 @@ func NewTracer(ctx context.Context, cfg *Config) (*Tracer, error) {
 	}
 
 	// Based on includeTracers we decide later which are loaded into the kernel.
-	ebpfMaps, ebpfProgs, err := initializeMapsAndPrograms(cfg.IncludeTracers, kernelSymbols,
-		cfg.FilterErrorFrames, cfg.MapScaleFactor, cfg.KernelVersionCheck, cfg.DebugTracer,
-		cfg.BPFVerifierLogLevel)
+	ebpfMaps, ebpfProgs, err := initializeMapsAndPrograms(kernelSymbols, cfg)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load eBPF code: %v", err)
 	}
@@ -369,9 +382,7 @@ func buildStackDeltaTemplates(coll *cebpf.CollectionSpec) error {
 
 // initializeMapsAndPrograms loads the definitions for the eBPF maps and programs provided
 // by the embedded elf file and loads these into the kernel.
-func initializeMapsAndPrograms(includeTracers types.IncludedTracers,
-	kernelSymbols *libpf.SymbolMap, filterErrorFrames bool, mapScaleFactor int,
-	kernelVersionCheck bool, debugTracer bool, bpfVerifierLogLevel uint32) (
+func initializeMapsAndPrograms(kernelSymbols *libpf.SymbolMap, cfg *Config) (
 	ebpfMaps map[string]*cebpf.Map, ebpfProgs map[string]*cebpf.Program, err error) {
 	// Loading specifications about eBPF programs and maps from the embedded elf file
 	// does not load them into the kernel.
@@ -379,7 +390,7 @@ func initializeMapsAndPrograms(includeTracers types.IncludedTracers,
 	// References to eBPF maps in the eBPF programs are just placeholders that need to be
 	// replaced by the actual loaded maps later on with RewriteMaps before loading the
 	// programs into the kernel.
-	coll, err := support.LoadCollectionSpec(debugTracer)
+	coll, err := support.LoadCollectionSpec(cfg.DebugTracer)
 	if err != nil {
 		return nil, nil, fmt.Errorf("failed to load specification for tracers: %v", err)
 	}
@@ -395,7 +406,7 @@ func initializeMapsAndPrograms(includeTracers types.IncludedTracers,
 	// Load all maps into the kernel that are used later on in eBPF programs. So we can rewrite
 	// in the next step the placesholders in the eBPF programs with the file descriptors of the
 	// loaded maps in the kernel.
-	if err = loadAllMaps(coll, ebpfMaps, mapScaleFactor); err != nil {
+	if err = loadAllMaps(coll, ebpfMaps, cfg.MapScaleFactor); err != nil {
 		return nil, nil, fmt.Errorf("failed to load eBPF maps: %v", err)
 	}
 
@@ -406,7 +417,7 @@ func initializeMapsAndPrograms(includeTracers types.IncludedTracers,
 		return nil, nil, fmt.Errorf("failed to rewrite maps: %v", err)
 	}
 
-	if kernelVersionCheck {
+	if cfg.KernelVersionCheck {
 		var major, minor, patch uint32
 		major, minor, patch, err = GetCurrentKernelVersion()
 		if err != nil {
@@ -426,13 +437,68 @@ func initializeMapsAndPrograms(includeTracers types.IncludedTracers,
 		}
 	}
 
-	if err = loadUnwinders(coll, ebpfProgs, ebpfMaps["progs"], includeTracers,
-		bpfVerifierLogLevel); err != nil {
-		return nil, nil, fmt.Errorf("failed to load eBPF programs: %v", err)
+	tailCallProgs := []progLoaderHelper{
+		{
+			progID: uint32(support.ProgUnwindStop),
+			name:   "unwind_stop",
+			enable: true,
+		},
+		{
+			progID: uint32(support.ProgUnwindNative),
+			name:   "unwind_native",
+			enable: true,
+		},
+		{
+			progID: uint32(support.ProgUnwindHotspot),
+			name:   "unwind_hotspot",
+			enable: cfg.IncludeTracers.Has(types.HotspotTracer),
+		},
+		{
+			progID: uint32(support.ProgUnwindPerl),
+			name:   "unwind_perl",
+			enable: cfg.IncludeTracers.Has(types.PerlTracer),
+		},
+		{
+			progID: uint32(support.ProgUnwindPHP),
+			name:   "unwind_php",
+			enable: cfg.IncludeTracers.Has(types.PHPTracer),
+		},
+		{
+			progID: uint32(support.ProgUnwindPython),
+			name:   "unwind_python",
+			enable: cfg.IncludeTracers.Has(types.PythonTracer),
+		},
+		{
+			progID: uint32(support.ProgUnwindRuby),
+			name:   "unwind_ruby",
+			enable: cfg.IncludeTracers.Has(types.RubyTracer),
+		},
+		{
+			progID: uint32(support.ProgUnwindV8),
+			name:   "unwind_v8",
+			enable: cfg.IncludeTracers.Has(types.V8Tracer),
+		},
+		{
+			progID: uint32(support.ProgUnwindDotnet),
+			name:   "unwind_dotnet",
+			enable: cfg.IncludeTracers.Has(types.DotnetTracer),
+		},
+	}
+
+	if err = loadPerfUnwinders(coll, ebpfProgs, ebpfMaps["perf_progs"], tailCallProgs,
+		cfg.BPFVerifierLogLevel); err != nil {
+		return nil, nil, fmt.Errorf("failed to load perf eBPF programs: %v", err)
 	}
 
-	if err = loadSystemConfig(coll, ebpfMaps, kernelSymbols, includeTracers,
-		filterErrorFrames); err != nil {
+	if cfg.OffCPUThreshold < support.OffCPUThresholdMax {
+		if err = loadKProbeUnwinders(coll, ebpfProgs, ebpfMaps["kprobe_progs"], tailCallProgs,
+			cfg.BPFVerifierLogLevel, ebpfMaps["perf_progs"].FD()); err != nil {
+			return nil, nil, fmt.Errorf("failed to load kprobe eBPF programs: %v", err)
+		}
+	}
+
+	if err = loadSystemConfig(coll, ebpfMaps, kernelSymbols, cfg.IncludeTracers,
+		cfg.OffCPUThreshold, cfg.FilterErrorFrames); err != nil {
 		return nil, nil, fmt.Errorf("failed to load system config: %v", err)
 	}
 
@@ -503,126 +569,170 @@ func loadAllMaps(coll *cebpf.CollectionSpec, ebpfMaps map[string]*cebpf.Map,
 	return nil
 }
 
-// loadUnwinders just satisfies the proof of concept and loads all eBPF programs
-func loadUnwinders(coll *cebpf.CollectionSpec, ebpfProgs map[string]*cebpf.Program,
-	tailcallMap *cebpf.Map, includeTracers types.IncludedTracers,
+// loadPerfUnwinders loads all enabled perf eBPF programs and their tail call targets.
+func loadPerfUnwinders(coll *cebpf.CollectionSpec, ebpfProgs map[string]*cebpf.Program,
+	tailcallMap *cebpf.Map, tailCallProgs []progLoaderHelper,
 	bpfVerifierLogLevel uint32) error {
-	restoreRlimit, err := rlimit.MaximizeMemlock()
-	if err != nil {
-		return fmt.Errorf("failed to adjust rlimit: %v", err)
+	programOptions := cebpf.ProgramOptions{
+		LogLevel: cebpf.LogLevel(bpfVerifierLogLevel),
 	}
-	defer restoreRlimit()
 
-	type prog struct {
-		// enable tells whether a prog shall be loaded.
-		enable bool
-		// name of the eBPF program
-		name string
-		// progID defines the ID for the eBPF program that is used as key in the tailcallMap.
-		progID uint32
-		// noTailCallTarget indicates if this eBPF program should be added to the tailcallMap.
-		noTailCallTarget bool
+	// Copy tailCallProgs into a fresh slice with room for the two entry points added below.
+	progs := append(make([]progLoaderHelper, 0, len(tailCallProgs)+2), tailCallProgs...)
+	progs = append(progs,
+		progLoaderHelper{
+			name:             "tracepoint__sched_process_exit",
+			noTailCallTarget: true,
+			enable:           true,
+		},
+		progLoaderHelper{
+			name:             "native_tracer_entry",
+			noTailCallTarget: true,
+			enable:           true,
+		})
+
+	for _, unwindProg := range progs {
+		if !unwindProg.enable {
+			continue
+		}
+
+		unwindProgName := unwindProg.name
+		if !unwindProg.noTailCallTarget {
+			unwindProgName = "perf_" + unwindProg.name
+		}
+
+		progSpec, ok := coll.Programs[unwindProgName]
+		if !ok {
+			return fmt.Errorf("program %s does not exist", unwindProgName)
+		}
+
+		if err := loadProgram(ebpfProgs, tailcallMap, unwindProg.progID, progSpec,
+			programOptions, unwindProg.noTailCallTarget); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// progArrayReferences returns the indices of all instructions in insns that
+// load a map whose file descriptor equals perfTailCallMapFD.
+func progArrayReferences(perfTailCallMapFD int, insns asm.Instructions) []int {
+	indices := []int{}
+	for idx := range insns {
+		insn := &insns[idx]
+		// Only load-class instructions are considered.
+		if insn.OpCode.Class() != asm.LdClass {
+			continue
+		}
+		// Skip loads without a map reference or with a reference to another map.
+		ref := insn.Map()
+		if ref == nil || ref.FD() != perfTailCallMapFD {
+			continue
+		}
+		indices = append(indices, idx)
 	}
+	return indices
+}
 
+// loadKProbeUnwinders reuses large parts of loadPerfUnwinders. By default all eBPF programs
+// are written as perf event eBPF programs. loadKProbeUnwinders dynamically rewrites the
+// specification of these programs to kprobe eBPF programs and adjusts tail call maps.
+func loadKProbeUnwinders(coll *cebpf.CollectionSpec, ebpfProgs map[string]*cebpf.Program,
+	tailcallMap *cebpf.Map, tailCallProgs []progLoaderHelper,
+	bpfVerifierLogLevel uint32, perfTailCallMapFD int) error {
 	programOptions := cebpf.ProgramOptions{
 		LogLevel: cebpf.LogLevel(bpfVerifierLogLevel),
 	}
 
-	for _, unwindProg := range []prog{
-		{
-			progID: uint32(support.ProgUnwindStop),
-			name:   "unwind_stop",
-			enable: true,
-		},
-		{
-			progID: uint32(support.ProgUnwindNative),
-			name:   "unwind_native",
-			enable: true,
-		},
-		{
-			progID: uint32(support.ProgUnwindHotspot),
-			name:   "unwind_hotspot",
-			enable: includeTracers.Has(types.HotspotTracer),
-		},
-		{
-			progID: uint32(support.ProgUnwindPerl),
-			name:   "unwind_perl",
-			enable: includeTracers.Has(types.PerlTracer),
-		},
-		{
-			progID: uint32(support.ProgUnwindPHP),
-			name:   "unwind_php",
-			enable: includeTracers.Has(types.PHPTracer),
-		},
-		{
-			progID: uint32(support.ProgUnwindPython),
-			name:   "unwind_python",
-			enable: includeTracers.Has(types.PythonTracer),
-		},
-		{
-			progID: uint32(support.ProgUnwindRuby),
-			name:   "unwind_ruby",
-			enable: includeTracers.Has(types.RubyTracer),
-		},
-		{
-			progID: uint32(support.ProgUnwindV8),
-			name:   "unwind_v8",
-			enable: includeTracers.Has(types.V8Tracer),
-		},
-		{
-			progID: uint32(support.ProgUnwindDotnet),
-			name:   "unwind_dotnet",
-			enable: includeTracers.Has(types.DotnetTracer),
-		},
-		{
-			name:             "tracepoint__sched_process_exit",
+	// Copy tailCallProgs into a fresh slice with room for the two entry points added below.
+	progs := append(make([]progLoaderHelper, 0, len(tailCallProgs)+2), tailCallProgs...)
+	progs = append(progs,
+		progLoaderHelper{
+			name:             "finish_task_switch",
 			noTailCallTarget: true,
 			enable:           true,
 		},
-		{
-			name:             "native_tracer_entry",
+		progLoaderHelper{
+			name:             "tracepoint__sched_switch",
 			noTailCallTarget: true,
 			enable:           true,
 		},
-	} {
+	)
+
+	for _, unwindProg := range progs {
 		if !unwindProg.enable {
 			continue
 		}
 
-		// Load the eBPF program into the kernel. If no error is returned,
-		// the eBPF program can be used/called/triggered from now on.
-		unwinder, err := cebpf.NewProgramWithOptions(coll.Programs[unwindProg.name],
-			programOptions)
-		if err != nil {
-			// These errors tend to have hundreds of lines (or more),
-			// so we print each line individually.
-			if ve, ok := err.(*cebpf.VerifierError); ok {
-				for _, line := range ve.Log {
-					log.Error(line)
-				}
-			} else {
-				scanner := bufio.NewScanner(strings.NewReader(err.Error()))
-				for scanner.Scan() {
-					log.Error(scanner.Text())
-				}
+		unwindProgName := unwindProg.name
+		if !unwindProg.noTailCallTarget {
+			unwindProgName = "kprobe_" + unwindProg.name
+		}
+
+		progSpec, ok := coll.Programs[unwindProgName]
+		if !ok {
+			return fmt.Errorf("program %s does not exist", unwindProgName)
+		}
+
+		// Point every reference to the perf tail call map at tailcallMap instead.
+		insns := progArrayReferences(perfTailCallMapFD, progSpec.Instructions)
+		for _, ins := range insns {
+			if err := progSpec.Instructions[ins].AssociateMap(tailcallMap); err != nil {
+				return fmt.Errorf("failed to rewrite map ptr: %v", err)
 			}
-			return fmt.Errorf("failed to load %s", unwindProg.name)
 		}
 
-		ebpfProgs[unwindProg.name] = unwinder
-		fd := uint32(unwinder.FD())
-		if unwindProg.noTailCallTarget {
-			continue
+		if err := loadProgram(ebpfProgs, tailcallMap, unwindProg.progID, progSpec,
+			programOptions, unwindProg.noTailCallTarget); err != nil {
+			return err
 		}
-		if err := tailcallMap.Update(unsafe.Pointer(&unwindProg.progID), unsafe.Pointer(&fd),
-			cebpf.UpdateAny); err != nil {
-			// Every eBPF program that is loaded within loadUnwinders can be the
-			// destination of a tail call of another eBPF program. If we can not update
-			// the eBPF map that manages these destinations our unwinding will fail.
-			return fmt.Errorf("failed to update tailcall map: %v", err)
+	}
+
+	return nil
+}
+
+// loadProgram loads progSpec, stores it in ebpfProgs and optionally registers it in tailcallMap.
+func loadProgram(ebpfProgs map[string]*cebpf.Program, tailcallMap *cebpf.Map,
+	progID uint32, progSpec *cebpf.ProgramSpec, programOptions cebpf.ProgramOptions,
+	noTailCallTarget bool) error {
+	restoreRlimit, err := rlimit.MaximizeMemlock()
+	if err != nil {
+		return fmt.Errorf("failed to adjust rlimit: %v", err)
+	}
+	defer restoreRlimit()
+
+	// Load the eBPF program into the kernel. If no error is returned,
+	// the eBPF program can be used/called/triggered from now on.
+	unwinder, err := cebpf.NewProgramWithOptions(progSpec, programOptions)
+	if err != nil {
+		// Load errors can span hundreds of lines, so each line is logged individually.
+		var ve *cebpf.VerifierError
+		if errors.As(err, &ve) {
+			for _, line := range ve.Log {
+				log.Error(line)
+			}
+		} else {
+			scanner := bufio.NewScanner(strings.NewReader(err.Error()))
+			for scanner.Scan() {
+				log.Error(scanner.Text())
+			}
 		}
+		return fmt.Errorf("failed to load %s", progSpec.Name)
 	}
+	ebpfProgs[progSpec.Name] = unwinder
 
+	if noTailCallTarget {
+		return nil
+	}
+	fd := uint32(unwinder.FD())
+	if err := tailcallMap.Update(unsafe.Pointer(&progID), unsafe.Pointer(&fd),
+		cebpf.UpdateAny); err != nil {
+		// Every eBPF program registered here can be the destination of a tail
+		// call of another eBPF program. If we can not update the eBPF map that
+		// manages these destinations our unwinding will fail.
+		return fmt.Errorf("failed to update tailcall map: %v", err)
+	}
 	return nil
 }
 
@@ -864,6 +974,8 @@ func (t *Tracer) loadBpfTrace(raw []byte, cpu int) *host.Trace {
 		APMTransactionID: *(*libpf.APMTransactionID)(unsafe.Pointer(&ptr.apm_transaction_id)),
 		PID:              pid,
 		TID:              libpf.PID(ptr.tid),
+		Origin:           libpf.Origin(ptr.origin),
+		OffTime:          uint64(ptr.offtime),
 		KTime:            times.KTime(ptr.ktime),
 		CPU:              cpu,
 	}
@@ -871,11 +983,13 @@ func (t *Tracer) loadBpfTrace(raw []byte, cpu int) *host.Trace {
 	// Trace fields included in the hash:
 	//  - PID, kernel stack ID, length & frame array
 	// Intentionally excluded:
-	//  - ktime, COMM, APM trace, APM transaction ID
+	//  - ktime, COMM, APM trace, APM transaction ID, Origin and Off Time
 	ptr.comm = [16]C.char{}
 	ptr.apm_trace_id = C.ApmTraceID{}
 	ptr.apm_transaction_id = C.ApmSpanID{}
 	ptr.ktime = 0
+	ptr.origin = 0
+	ptr.offtime = 0
 	trace.Hash = host.TraceHash(xxh3.Hash128(raw).Lo)
 
 	userFrameOffs := 0
@@ -1161,6 +1275,39 @@ func (t *Tracer) StartProbabilisticProfiling(ctx context.Context) {
 	})
 }
 
+// StartOffCPUProfiling starts off-cpu profiling by attaching the programs to the hooks.
+func (t *Tracer) StartOffCPUProfiling() error {
+	// Attach the second hook for off-cpu profiling first.
+	kprobeProg, ok := t.ebpfProgs["finish_task_switch"]
+	if !ok {
+		return errors.New("off-cpu program finish_task_switch is not available")
+	}
+
+	kprobeSymbol, err := t.kernelSymbols.LookupSymbolByPrefix("finish_task_switch")
+	if err != nil {
+		return fmt.Errorf("failed to find kernel symbol for finish_task_switch: %v", err)
+	}
+
+	kprobeLink, err := link.Kprobe(string(kprobeSymbol.Name), kprobeProg, nil)
+	if err != nil {
+		return err
+	}
+	t.hooks[hookPoint{group: "kprobe", name: "finish_task_switch"}] = kprobeLink
+
+	// Attach the first hook that enables off-cpu profiling.
+	tpProg, ok := t.ebpfProgs["tracepoint__sched_switch"]
+	if !ok {
+		return errors.New("tracepoint__sched_switch is not available")
+	}
+	tpLink, err := link.Tracepoint("sched", "sched_switch", tpProg, nil)
+	if err != nil {
+		return err
+	}
+	t.hooks[hookPoint{group: "sched", name: "sched_switch"}] = tpLink
+
+	return nil
+}
+
 // TraceProcessor gets the trace processor.
 func (t *Tracer) TraceProcessor() tracehandler.TraceProcessor {
 	return t.processManager