Linux provides several facilities for tracing. These facilities, and the underlying mechanisms they build on, are described below:
tracepoints
A tracepoint provides tracing capability through static instrumentation points placed in the kernel source. A tracepoint is defined with the DECLARE_TRACE macro:
#define DECLARE_TRACE(name, proto, args) \
        __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), 1, \
                        PARAMS(void *__data, proto), \
                        PARAMS(__data, args))
This macro is then wrapped further to define events:
#define DEFINE_EVENT(template, name, proto, args) \
        DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))

#define TRACE_EVENT(name, proto, args, struct, assign, print) \
        DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
Below are the tracepoint definitions for softirqs built from these macros:
DEFINE_EVENT(softirq, softirq_entry,
        TP_PROTO(unsigned int vec_nr),
        TP_ARGS(vec_nr)
);

DEFINE_EVENT(softirq, softirq_exit,
        TP_PROTO(unsigned int vec_nr),
        TP_ARGS(vec_nr)
);

DEFINE_EVENT(softirq, softirq_raise,
        TP_PROTO(unsigned int vec_nr),
        TP_ARGS(vec_nr)
);
These tracepoints are then called from __do_softirq():
void __do_softirq(void)
{
        // ...
        trace_softirq_entry(vec_nr);    /* fires just before the handler runs */
        h->action(h);                   /* run the softirq handler */
        trace_softirq_exit(vec_nr);     /* fires right after the handler returns */
        // ...
}
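On the consumer side, the same DECLARE_TRACE machinery also generates register_trace_<name>() / unregister_trace_<name>() helpers, and the PARAMS(void *__data, proto) expansion above is why every probe receives a void *data pointer ahead of the tracepoint arguments. A minimal kernel-module sketch that hooks softirq_entry (our illustration, not kernel code; it also assumes the tracepoint is exported to modules with EXPORT_TRACEPOINT_SYMBOL):

#include <linux/module.h>
#include <linux/interrupt.h>
#include <trace/events/irq.h>

/* Probe signature: (void *data, <TP_PROTO args>), matching PARAMS(void *__data, proto). */
static void probe_softirq_entry(void *data, unsigned int vec_nr)
{
        pr_info("softirq_entry: vec=%u\n", vec_nr);
}

static int __init softirq_probe_init(void)
{
        /* register_trace_softirq_entry() is generated by the tracepoint macros */
        return register_trace_softirq_entry(probe_softirq_entry, NULL);
}

static void __exit softirq_probe_exit(void)
{
        unregister_trace_softirq_entry(probe_softirq_entry, NULL);
        tracepoint_synchronize_unregister();    /* wait for in-flight probes to finish */
}

module_init(softirq_probe_init);
module_exit(softirq_probe_exit);
MODULE_LICENSE("GPL");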
A large number of tracepoints have been added to the Linux scheduler, memory management, file system, networking and other subsystems, exposing a wide range of events.
Tracepoints themselves are designed to be switched on and off. When disabled, a tracepoint amounts to little more than a predicted-not-taken branch (or a patched-in no-op on kernels with jump-label support), so it does not affect system performance.
All events available on the system can be seen under /sys/kernel/debug/tracing/events/. For example:
$ sudo ls /sys/kernel/debug/tracing/events/irq
enable  filter  irq_handler_entry  irq_handler_exit  softirq_entry  softirq_exit  softirq_raise  tasklet_entry  tasklet_exit
To enable an event, simply write 1 to the corresponding enable file, for example: echo 1 > /sys/kernel/debug/tracing/events/irq/softirq_entry/enable. Writing to the directory-level enable file shown above switches on every event in that subsystem.
perf_events
perf_events provides tracing and profiling capability by monitoring hardware counters in the CPU's PMU (Performance Monitoring Unit).
After a program requests monitoring, perf_events attaches the corresponding context to the process's task_struct. When the monitored events occur in the program, the perf_events context is updated.
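From user space, such a request is made with the perf_event_open(2) system call; glibc provides no wrapper, so the perf_event_open() helper below is our own. A minimal sketch that counts retired instructions for the calling process:

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

/* Thin wrapper: there is no glibc wrapper for this syscall. */
static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
        return syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        long long count;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;             /* a generic hardware counter ...      */
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;   /* ... counting retired instructions   */
        attr.disabled = 1;                          /* start disabled, enable explicitly   */
        attr.exclude_kernel = 1;                    /* count user-space execution only     */
        attr.exclude_hv = 1;

        fd = perf_event_open(&attr, 0 /* this process */, -1 /* any CPU */, -1, 0);
        if (fd == -1) {
                perror("perf_event_open");
                return EXIT_FAILURE;
        }

        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

        printf("measuring this printf\n");          /* the "workload" being measured */

        ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
        if (read(fd, &count, sizeof(count)) == (ssize_t)sizeof(count))
                printf("instructions: %lld\n", count);

        close(fd);
        return 0;
}

The value returned by read() corresponds to the count field of struct perf_event below, and the ioctl enable/disable calls drive its state and the total_time_enabled / total_time_running accounting.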
The kernel-side representation of such an event is struct perf_event:
/**
* struct perf_event - performance event kernel representation:
*/
struct perf_event {
#ifdef CONFIG_PERF_EVENTS
struct list_head group_entry;
struct list_head event_entry;
struct list_head sibling_list;
struct hlist_node hlist_entry;
int nr_siblings;
int group_flags;
struct perf_event *group_leader;
struct pmu *pmu;
enum perf_event_active_state state;
unsigned int attach_state;
local64_t count;
atomic64_t child_count;
/*
* These are the total time in nanoseconds that the event
* has been enabled (i.e. eligible to run, and the task has
* been scheduled in, if this is a per-task event)
* and running (scheduled onto the CPU), respectively.
*
* They are computed from tstamp_enabled, tstamp_running and
* tstamp_stopped when the event is in INACTIVE or ACTIVE state.
*/
u64 total_time_enabled;
u64 total_time_running;
/*
* These are timestamps used for computing total_time_enabled
* and total_time_running when the event is in INACTIVE or
* ACTIVE state, measured in nanoseconds from an arbitrary point
* in time.
* tstamp_enabled: the notional time when the event was enabled
* tstamp_running: the notional time when the event was scheduled on
* tstamp_stopped: in INACTIVE state, the notional time when the
* event was scheduled off.
*/
u64 tstamp_enabled;
u64 tstamp_running;
u64 tstamp_stopped;
/*
* timestamp shadows the actual context timing but it can
* be safely used in NMI interrupt context. It reflects the
* context time as it was when the event was last scheduled in.
*
* ctx_time already accounts for ctx->timestamp. Therefore to
* compute ctx_time for a sample, simply add perf_clock().
*/
u64 shadow_ctx_time;
struct perf_event_attr attr;
u16 header_size;
u16 id_header_size;
u16 read_size;
struct hw_perf_event hw;
struct perf_event_context *ctx;
struct file *filp;
/*
* These accumulate total time (in nanoseconds) that children
* events have been enabled and running, respectively.
*/
atomic64_t child_total_time_enabled;
atomic64_t child_total_time_running;
/*
* Protect attach/detach and child_list:
*/
struct mutex child_mutex;
struct list_head child_list;
struct perf_event *parent;
int oncpu;
int cpu;
struct list_head owner_entry;
struct task_struct *owner;
/* mmap bits */
struct mutex mmap_mutex;
atomic_t mmap_count;
int mmap_locked;
struct user_struct *mmap_user;
struct perf_buffer *buffer;
/* poll related */
wait_queue_head_t waitq;
struct fasync_struct *fasync;
/* delayed work for NMIs and such */
int pending_wakeup;
int pending_kill;
int pending_disable;
struct irq_work pending;
atomic_t event_limit;
void (*destroy)(struct perf_event *);
struct rcu_head rcu_head;
struct pid_namespace *ns;
u64 id;
perf_overflow_handler_t overflow_handler;
#ifdef CONFIG_EVENT_TRACING
struct ftrace_event_call *tp_event;
struct event_filter *filter;
#endif
#ifdef CONFIG_CGROUP_PERF
struct perf_cgroup *cgrp; /* cgroup event is attach to */
int cgrp_defer_enabled;
#endif
#endif /* CONFIG_PERF_EVENTS */
};