Linux provides several facilities for tracing. These facilities, and the underlying mechanisms they build on, are described below:
tracepoints
A tracepoint provides tracing capability through static instrumentation points placed in the kernel source. A tracepoint is defined with the DECLARE_TRACE macro:
#define DECLARE_TRACE(name, proto, args) \
        __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), 1, \
                        PARAMS(void *__data, proto), \
                        PARAMS(__data, args))
This macro is then wrapped further to define events:
#define DEFINE_EVENT(template, name, proto, args) \
        DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))

#define TRACE_EVENT(name, proto, args, struct, assign, print) \
        DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
Below are the tracepoint definitions for softirqs built from these macros:
DEFINE_EVENT(softirq, softirq_entry,
        TP_PROTO(unsigned int vec_nr),
        TP_ARGS(vec_nr)
);

DEFINE_EVENT(softirq, softirq_exit,
        TP_PROTO(unsigned int vec_nr),
        TP_ARGS(vec_nr)
);

DEFINE_EVENT(softirq, softirq_raise,
        TP_PROTO(unsigned int vec_nr),
        TP_ARGS(vec_nr)
);
These tracepoints are then called from __do_softirq():
void __do_softirq(void)
{
        // ...
        trace_softirq_entry(vec_nr);    /* fires just before the handler runs */
        h->action(h);                   /* run the softirq handler */
        trace_softirq_exit(vec_nr);     /* fires right after the handler returns */
        // ...
}
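On the consumer side, the same DECLARE_TRACE machinery also generates register_trace_<name>() / unregister_trace_<name>() helpers, and the PARAMS(void *__data, proto) expansion above is why every probe receives a void *data pointer ahead of the tracepoint arguments. A minimal kernel-module sketch that hooks softirq_entry (our illustration, not kernel code; it also assumes the tracepoint is exported to modules with EXPORT_TRACEPOINT_SYMBOL):

#include <linux/module.h>
#include <linux/interrupt.h>
#include <trace/events/irq.h>

/* Probe signature: (void *data, <TP_PROTO args>), matching PARAMS(void *__data, proto). */
static void probe_softirq_entry(void *data, unsigned int vec_nr)
{
        pr_info("softirq_entry: vec=%u\n", vec_nr);
}

static int __init softirq_probe_init(void)
{
        /* register_trace_softirq_entry() is generated by the tracepoint macros */
        return register_trace_softirq_entry(probe_softirq_entry, NULL);
}

static void __exit softirq_probe_exit(void)
{
        unregister_trace_softirq_entry(probe_softirq_entry, NULL);
        tracepoint_synchronize_unregister();    /* wait for in-flight probes to finish */
}

module_init(softirq_probe_init);
module_exit(softirq_probe_exit);
MODULE_LICENSE("GPL");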
A large number of tracepoints have been added to the Linux scheduler, memory management, file system, networking and other subsystems, exposing a wide range of events.
Tracepoints themselves are designed to be switched on and off. When disabled, a tracepoint amounts to little more than a predicted-not-taken branch (or a patched-in no-op on kernels with jump-label support), so it does not affect system performance.
All events available on the system can be seen under /sys/kernel/debug/tracing/events/. For example:
$ sudo ls /sys/kernel/debug/tracing/events/irq
enable  filter  irq_handler_entry  irq_handler_exit  softirq_entry  softirq_exit  softirq_raise  tasklet_entry  tasklet_exit
To enable an event, simply write 1 to the corresponding enable file, for example: echo 1 > /sys/kernel/debug/tracing/events/irq/softirq_entry/enable. Writing to the directory-level enable file shown above switches on every event in that subsystem.
perf_events
perf_events provides tracing and profiling capability by monitoring hardware counters in the CPU's PMU (Performance Monitoring Unit).
After a program requests monitoring, perf_events attaches the corresponding context to the process's task_struct. When the monitored events occur in the program, the perf_events context is updated.
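From user space, such a request is made with the perf_event_open(2) system call; glibc provides no wrapper, so the perf_event_open() helper below is our own. A minimal sketch that counts retired instructions for the calling process:

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

/* Thin wrapper: there is no glibc wrapper for this syscall. */
static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
        return syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        long long count;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;             /* a generic hardware counter ...      */
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;   /* ... counting retired instructions   */
        attr.disabled = 1;                          /* start disabled, enable explicitly   */
        attr.exclude_kernel = 1;                    /* count user-space execution only     */
        attr.exclude_hv = 1;

        fd = perf_event_open(&attr, 0 /* this process */, -1 /* any CPU */, -1, 0);
        if (fd == -1) {
                perror("perf_event_open");
                return EXIT_FAILURE;
        }

        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

        printf("measuring this printf\n");          /* the "workload" being measured */

        ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
        if (read(fd, &count, sizeof(count)) == (ssize_t)sizeof(count))
                printf("instructions: %lld\n", count);

        close(fd);
        return 0;
}

The value returned by read() corresponds to the count field of struct perf_event below, and the ioctl enable/disable calls drive its state and the total_time_enabled / total_time_running accounting.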
The kernel-side representation of such an event is struct perf_event:
/**
* struct perf_event - performance event kernel representation:
*/
struct perf_event {
#ifdef CONFIG_PERF_EVENTS
struct list_head group_entry;
struct list_head event_entry;
struct list_head sibling_list;
struct hlist_node hlist_entry;
int nr_siblings;
int group_flags;
struct perf_event *group_leader;
struct pmu *pmu;
enum perf_event_active_state state;
unsigned int attach_state;
local64_t count;
atomic64_t child_count;
/*
* These are the total time in nanoseconds that the event
* has been enabled (i.e. eligible to run, and the task has
* been scheduled in, if this is a per-task event)
* and running (scheduled onto the CPU), respectively.
*
* They are computed from tstamp_enabled, tstamp_running and
* tstamp_stopped when the event is in INACTIVE or ACTIVE state.
*/
u64 total_time_enabled;
u64 total_time_running;
/*
* These are timestamps used for computing total_time_enabled
* and total_time_running when the event is in INACTIVE or
* ACTIVE state, measured in nanoseconds from an arbitrary point
* in time.
* tstamp_enabled: the notional time when the event was enabled
* tstamp_running: the notional time when the event was scheduled on
* tstamp_stopped: in INACTIVE state, the notional time when the
* event was scheduled off.
*/
u64 tstamp_enabled;
u64 tstamp_running;
u64 tstamp_stopped;
/*
* timestamp shadows the actual context timing but it can
* be safely used in NMI interrupt context. It reflects the
* context time as it was when the event was last scheduled in.
*
* ctx_time already accounts for ctx->timestamp. Therefore to
* compute ctx_time for a sample, simply add perf_clock().
*/
u64 shadow_ctx_time;
struct perf_event_attr attr;
u16 header_size;
u16 id_header_size;
u16 read_size;
struct hw_perf_event hw;
struct perf_event_context *ctx;
struct file *filp;
/*
* These accumulate total time (in nanoseconds) that children
* events have been enabled and running, respectively.
*/
atomic64_t child_total_time_enabled;
atomic64_t child_total_time_running;
/*
* Protect attach/detach and child_list:
*/
struct mutex child_mutex;
struct list_head child_list;
struct perf_event *parent;
int oncpu;
int cpu;
struct list_head owner_entry;
struct task_struct *owner;
/* mmap bits */
struct mutex mmap_mutex;
atomic_t mmap_count;
int mmap_locked;
struct user_struct *mmap_user;
struct perf_buffer *buffer;
/* poll related */
wait_queue_head_t waitq;
struct fasync_struct *fasync;
/* delayed work for NMIs and such */
int pending_wakeup;
int pending_kill;
int pending_disable;
struct irq_work pending;
atomic_t event_limit;
void (*destroy)(struct perf_event *);
struct rcu_head rcu_head;
struct pid_namespace *ns;
u64 id;
perf_overflow_handler_t overflow_handler;
#ifdef CONFIG_EVENT_TRACING
struct ftrace_event_call *tp_event;
struct event_filter *filter;
#endif
#ifdef CONFIG_CGROUP_PERF
struct perf_cgroup *cgrp; /* cgroup event is attach to */
int cgrp_defer_enabled;
#endif
#endif /* CONFIG_PERF_EVENTS */
};