【Linux】网络专题(三)——L4/L3数据通路

有了对sk_buff、sock、net_device的了解,我们接着介绍传输层数据通路,这里将以TCP为例。本节只介绍数据通路,要讲协议本身又需要很大篇幅,需要另开一篇文章。

1. TCP协议层

1.1. tcp_sendmsg

TCP传输层的入口是tcp_sendmsg

/*
 * Entry point of the TCP send path: take the socket lock, let
 * tcp_sendmsg_locked() do the actual work, then release the lock.
 *
 * Returns whatever tcp_sendmsg_locked() returned.
 */
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
	int sent;

	lock_sock(sk);
	sent = tcp_sendmsg_locked(sk, msg, size);
	release_sock(sk);

	return sent;
}

sock是套接字的网络层表示,是socket结构的成员;而socket则是BSD套接字。听起来有点迷惑。

简单的上锁之后,我们进入tcp_sendmsg_locked。它将待发送的数据msg拆分、安排至一个双向链表中,该链表由sock结构体的sk_write_queue进行维护,每个元素都是一个sk_buff。sk_buff是一个骚气的数据结构,可以看作是网络栈各层次的通用包裹,包含各种各样的信息。各层都会创造自己的sk_buff,但里面包装的数据永远不会被重复拷贝。它作为网络栈的核心数据结构,值得一讲,不过这里我们只要理解它是数据缓冲区就好。

tcp_sendmsg_locked中有一个循环用来处理报文段和从高速缓存中分配skb,但是因为情况非常多,所以整个控制逻辑用了很多goto,显得十分凌乱。不论如何,TCP在循环中将报文段通过tcp_push传向下一层处理逻辑。

1.2. tcp_push

/*
 * Look at the skb at the tail of the socket's write queue, mark it for push
 * and urgent handling as requested by @flags, and hand the pending data to
 * __tcp_push_pending_frames() unless autocorking decides to hold it back.
 *
 * @sk:        socket whose write queue is examined
 * @flags:     send flags; only MSG_MORE is tested directly in this function
 * @mss_now:   forwarded unchanged to __tcp_push_pending_frames()
 * @nonagle:   forwarded to __tcp_push_pending_frames(); replaced by
 *             TCP_NAGLE_CORK when MSG_MORE is set
 * @size_goal: forwarded to tcp_should_autocork()
 */
void tcp_push(struct sock *sk, int flags, int mss_now,
	      int nonagle, int size_goal)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	/* Nothing queued: nothing to push. */
	skb = tcp_write_queue_tail(sk);
	if (!skb)
		return;
	/* Mark the tail skb for push unless the caller announced more data
	 * (MSG_MORE) and forced_push() does not demand it.
	 */
	if (!(flags & MSG_MORE) || forced_push(tp))
		tcp_mark_push(tp, skb);

	tcp_mark_urg(tp, flags);

	if (tcp_should_autocork(sk, skb, size_goal)) {

		/* avoid atomic op if TSQ_THROTTLED bit is already set */
		if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
			set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
		}
		/* It is possible TX completion already happened
		 * before we set TSQ_THROTTLED.
		 */
		/* Hold the data back only while sk_wmem_alloc still exceeds
		 * this skb's truesize; otherwise fall through and send now.
		 */
		if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
			return;
	}

	/* MSG_MORE overrides the caller's Nagle setting with corking. */
	if (flags & MSG_MORE)
		nonagle = TCP_NAGLE_CORK;

	__tcp_push_pending_frames(sk, mss_now, nonagle);
}

tcp_push会先取出写队列尾部的skb,判断它是否需要push,需要的话就为PSH置位。置位是在sk_buff的cb域中做的,它是每一层的控制块,传输层会在控制块里塞入主机字节序的报文段头。

接下来,内核进入输出引擎部分。

1.3. TCP输出引擎

首先调用__tcp_push_pending_frames

/* Flush frames that were being held back, either because of TCP_CORK
 * or because tiny packets were being coalesced.
 * The caller must hold the socket lock.
 */
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
			       int nonagle)
{
	/* A closed socket keeps its bytes queued; closedown empties the
	 * write queue later on.
	 */
	if (unlikely(sk->sk_state == TCP_CLOSE))
		return;

	/* Zero from tcp_write_xmit() means there is nothing more to do here. */
	if (!tcp_write_xmit(sk, cur_mss, nonagle, 0,
			    sk_gfp_mask(sk, GFP_ATOMIC)))
		return;

	tcp_check_probe_timer(sk);
}

这一步会判断状态,如果连接已经关闭了,就什么都不做;否则,调用tcp_write_xmit。按正常的流程,它会通过tcp_transmit_skb发送报文,并在tcp_event_new_data_sent中完成事后的一些事件,例如统计计数、启动重传超时计时器等:

		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
			break;

		tcp_event_new_data_sent(sk, skb);

tcp_transmit_skb会调用__tcp_transmit_skb,它执行最后的准备工作,例如计算校验和等,然后调用网络层接口。

/*
 * Abridged excerpt of the final TCP transmit step. Only the two indirect
 * calls through icsk->icsk_af_ops are shown:
 *   - send_check: tcp_v4_send_check() or tcp_v6_send_check()
 *   - queue_xmit: ip_queue_xmit() or inet6_csk_xmit(), whose result is
 *     stored in err
 * Everything else (declarations, header construction, return) is elided.
 */
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
	/* ... (elided) ... */
	INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
			   tcp_v6_send_check, tcp_v4_send_check,
			   sk, skb);
	/* ... (elided) ... */
	/* Hand the segment to the network layer. */
	err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
				 inet6_csk_xmit, ip_queue_xmit,
				 sk, skb, &inet->cork.fl);
	/* ... (elided) ... */
}

2. IP协议层

这里的代码不得不说……嗯……有点抽象,接口也设计得比较奇怪。

2.1. ip_queue_xmit

ip_queue_xmit调用__ip_queue_xmit,后者会执行路由和IP头封装。

/*
 * Route an outgoing skb for a connected socket, prepend and fill in the
 * IPv4 header (including options, if the socket has any), then pass the
 * packet to ip_local_out().
 *
 * @sk:  sending socket
 * @skb: packet to send; freed here on the no_route path
 * @fl:  flow description; its u.ip4 member is used for the route lookup
 *       and for ip_copy_addrs()
 * @tos: type-of-service value written into the header
 *
 * Returns the result of ip_local_out(), or -EHOSTUNREACH when no usable
 * route exists.
 */
int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
		    __u8 tos)
{
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	/* Use the route already attached to the skb if there is one;
	 * otherwise a route has to be found below.
	 */
	rt = skb_rtable(skb);
	if (rt)
		goto packet_routed;

	/* Make sure we can route this packet. */

	/* Check whether the route cached in the sock is still valid. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (!rt) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */

		/* Look up the route. */
		rt = ip_route_output_ports(net, fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS_TOS(sk, tos),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;

		/* Store the new route in the sock. */
		sk_setup_caps(sk, &rt->dst);
	}

	/* Attach the route to the skb. */
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	/* Strict source routing cannot go through a gateway. */
	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */

	/* Build the IP header and set the options. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	/* One 16-bit store for the start of the header: 4 in the top nibble,
	 * 5 in the next nibble, and the TOS byte.
	 */
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);

	/* Transport layer set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		/* optlen is in bytes, ihl counts 32-bit words. */
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_segs(net, skb, sk,
			     skb_shinfo(skb)->gso_segs ?: 1);

	/* TODO : should we use skb->sk here instead of sk ? */
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	res = ip_local_out(net, sk, skb);
	rcu_read_unlock();
	return res;

no_route:
	/* No usable route: count it, drop the packet, report the error. */
	rcu_read_unlock();
	IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}

这样三层数据报就产生了,接下来通过ip_local_out进行发送。

2.2. ip_local_out

到此,我们完全离开了传输层,并与UDP等协议产生的数据报胜利会师。

首先进行发送前检查,确认可以发送,则调用dst_output发送。

/*
 * Run the pre-send step (__ip_local_out()) and, only when it reports 1,
 * continue with dst_output(). Any other value from __ip_local_out() is
 * returned to the caller unchanged.
 */
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int verdict = __ip_local_out(net, sk, skb);

	if (unlikely(verdict != 1))
		return verdict;

	return dst_output(net, sk, skb);
}

2.3. 发送前检查:__ip_local_out

/*
 * Pre-send step for a locally generated IPv4 packet: store the total
 * length in the header, run ip_send_check() on it, give an L3 master
 * device the chance to take the skb, then run the netfilter
 * NF_INET_LOCAL_OUT hook with dst_output as the okfn.
 *
 * Returns 0 when l3mdev_ip_out() returned no skb; otherwise the value of
 * nf_hook(). ip_local_out() calls dst_output() itself when that value is 1.
 */
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);

	/* if egress device is enslaved to an L3 master device pass the
	 * skb to its handler for processing
	 */
	skb = l3mdev_ip_out(sk, skb);
	if (unlikely(!skb))
		return 0;

	skb->protocol = htons(ETH_P_IP);

	/* Fixed: the argument list previously read "sk, sk,b, NULL", which
	 * passed sk twice plus an undeclared identifier b instead of skb.
	 */
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
		       net, sk, skb, NULL, skb_dst(skb)->dev,
		       dst_output);
}

在该函数中进行最后的校验和计算,然后便调用nf_hook进入netfilter子系统,后者会处理NF_INET_LOCAL_OUT的hook函数,并确认是否允许数据报通过。nf_hook的返回值为1表示允许通过,表明调用者可以手动调用okfn函数,上例中即为dst_output。

2.4. dst_output

/* Output packet to network from transport.
 *
 * Calls the output method of the dst entry attached to @skb and returns
 * its result.
 */
static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	return skb_dst(skb)->output(net, sk, skb);
}

它查找到关联到该skb的dst条目,然后调用output方法。通常,该方法是ip_output

dst条目是dst_entry结构体,实现协议无关的目标缓存。对于TCP协议,它在ip_route_output_ports中被创建,后者进一步调用__mkroute_output来创建目标缓存条目。目标缓存和路由又是很大的内容,这里不再细讲。

2.5. ip_output

通过IP_UPD_PO_STATS更新统计计数,包括字节数和包数统计。接下来设置将发送此skb的设备和协议。

/*
 * Account the outgoing packet, point the skb at the egress device taken
 * from its dst entry, mark it as IPv4, and run the NF_INET_POST_ROUTING
 * hook with ip_finish_output as the continuation. The hook is bypassed
 * when the skb already carries the IPSKB_REROUTED flag.
 */
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *outdev = skb_dst(skb)->dev;
	/* Remember the device the skb had before it is overwritten below. */
	struct net_device *indev = skb->dev;

	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);

	skb->protocol = htons(ETH_P_IP);
	skb->dev = outdev;

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
			    net, sk, skb, indev, outdev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

最后,通过调用NF_HOOK_COND将控制权转交给netfilter,这是一个函数而非宏。我们看看它的实现:

/*
 * Conditionally run a netfilter hook before calling @okfn.
 *
 * When @cond is false the hook is skipped and @okfn is called directly.
 * When @cond is true nf_hook() runs first; @okfn is called only if
 * nf_hook() returns 1, otherwise the nf_hook() value is returned as is.
 */
static inline int
NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
	     struct sk_buff *skb, struct net_device *in, struct net_device *out,
	     int (*okfn)(struct net *, struct sock *, struct sk_buff *),
	     bool cond)
{
	if (cond) {
		int verdict = nf_hook(pf, hook, net, sk, skb, in, out, okfn);

		if (verdict != 1)
			return verdict;
	}

	return okfn(net, sk, skb);
}

传入的条件cond表示是否可以直接通过,若否,则需要进入nf_hook,由NF_INET_POST_ROUTING链上的钩子函数进行裁决。当nf_hook允许手动执行okfn时,执行okfn。正常情况下okfn将会被执行,本例中为ip_finish_output

2.6. ip_finish_output

在进入BPF子系统进行一些判断后,函数进入__ip_finish_output,后者执行分片相关的判断。

/*
 * Pick the way an IPv4 packet leaves the IP layer:
 *   - with CONFIG_NETFILTER and CONFIG_XFRM, a dst carrying an xfrm entry
 *     is flagged IPSKB_REROUTED and sent through dst_output() again;
 *   - GSO skbs go to ip_finish_output_gso();
 *   - packets longer than the MTU, or with a non-zero frag_max_size, go to
 *     ip_fragment() with ip_finish_output2 as the per-fragment callback;
 *   - everything else goes straight to ip_finish_output2().
 */
static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int path_mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif
	path_mtu = ip_skb_dst_mtu(sk, skb);

	if (skb_is_gso(skb))
		return ip_finish_output_gso(net, sk, skb, path_mtu);

	/* Fits in the MTU and frag_max_size is zero: send as is. */
	if (skb->len <= path_mtu && !IPCB(skb)->frag_max_size)
		return ip_finish_output2(net, sk, skb);

	return ip_fragment(net, sk, skb, path_mtu, ip_finish_output2);
}

首先,如果内核启用了netfilter和数据包转换(XFRM),并且该skb的dst条目上带有xfrm信息,就为skb置上IPSKB_REROUTED标志,并通过dst_output重新发送。

更为常见的情况是:

  • 如果skb是GSO数据包(skb_is_gso为真),那么交给ip_finish_output_gso处理,后者会进行分段等处理,最终调用ip_finish_output2;
  • 否则,查看数据报长度,如果大于MTU,需要交给ip_fragment分片,后者会调用回调函数ip_finish_output2进行发送;
  • 否则,可以直接调用ip_finish_output2进行发送。

所以,ip_finish_output2时而是回调函数,时而不是,真是太奇怪了。通过grep命令查到所有ip_fragment的调用都是将ip_finish_output2作为回调函数。

2.7. ip_finish_output2

兜兜转转这么长时间我们终于到了IP协议层最终的出口。

/*
 * Exit point of the IPv4 output path (first part of the function).
 *
 * This part updates the multicast/broadcast output counters according to
 * the route type, then makes sure the skb has enough headroom for the
 * link-layer header of the egress device, reallocating it if necessary.
 * Returns -ENOMEM (after freeing the skb) when that reallocation fails.
 */
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	bool is_v6gw = false;

	/* Per-route-type output statistics. */
	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		/* Not enough room in front of the data: get a copy that has it. */
		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (!skb2) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		/* Carry the owning socket over to the new skb, then release
		 * the old one.
		 */
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}


首先判断协议是否是多播或者广播,并增加其统计数据。其次要确保skb能够容纳下任何的链路层头,不够就调用skb_realloc_headroom分配一个新的skb,并释放原来的skb。不过这种情况不太容易发生。

	/* Lightweight-tunnel redirect: let lwtunnel_xmit() take the skb and
	 * stop here if it failed or reports that it is done.
	 */
	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	/* Get the neighbour entry for the gateway and transmit through it;
	 * the lookup and the send both happen inside the RCU-bh read section.
	 */
	rcu_read_lock_bh();
	neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
	if (!IS_ERR(neigh)) {
		int res;

		sock_confirm_neigh(skb, neigh);
		/* if crossing protocols, can not use the cached header */
		res = neigh_output(neigh, skb, is_v6gw);
		rcu_read_unlock_bh();
		return res;
	}
	rcu_read_unlock_bh();

	/* No neighbour could be obtained: log, drop the packet, fail. */
	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
			    __func__);
	kfree_skb(skb);
	return -EINVAL;
}

下面通过ip_neigh_for_gw获取邻居缓存,ip_neigh_for_gw的逻辑是先查找,找不到则创建一个新的邻居,这种情况发生在第一次和某主机通信的时候。如果邻居获取成功,就调用neigh_output进行发送。

3. 邻居子系统

3.1. neigh_output

有两条线可供选择,一条是neigh_hh_output,另一条是n->output

/*
 * Transmit @skb through neighbour @n.
 *
 * The cached hardware header path (neigh_hh_output()) is taken only when
 * the caller did not ask to skip the cache, the neighbour state has a
 * NUD_CONNECTED bit set, and a cached header length is present. In every
 * other case the neighbour's own output method is used.
 */
static inline int neigh_output(struct neighbour *n, struct sk_buff *skb,
			       bool skip_cache)
{
	const struct hh_cache *hh = &n->hh;
	bool use_cached_hdr;

	/* Both n->nud_state and hh->hh_len may change concurrently;
	 * neigh_hh_output() deals with that race afterwards.
	 */
	use_cached_hdr = !skip_cache &&
			 (READ_ONCE(n->nud_state) & NUD_CONNECTED) &&
			 READ_ONCE(hh->hh_len);

	if (!use_cached_hdr)
		return n->output(n, skb);

	return neigh_hh_output(hh, skb);
}
  • !skip_cache:传入的参数是is_v6gw,该参数在获取邻居时确定(ip_neigh_for_gw)。换言之,该条件在下一跳网关不是IPv6地址时成立。
  • NUD_CONNECTED:满足以下三个条件之一:静态路由、不需要ARP请求(例如多播、广播、环回)、邻居可达(即已经成功处理ARP请求)。
  • hh->hh_len:“硬件头”长度,不为0说明已经被缓存。

当上面三个条件均成立时,调用neigh_hh_output,否则调用n->output。

3.2. neigh_hh_output

在发送到网络设备子系统前,将头数据复制到skb。

/*
 * Copy the cached hardware header from @hh into the headroom in front of
 * skb->data, extend the skb over it, and pass the skb to dev_queue_xmit().
 *
 * The copy runs inside a read_seqbegin()/read_seqretry() loop on
 * hh->hh_lock and is repeated when read_seqretry() asks for it.
 *
 * Returns the result of dev_queue_xmit(), or NET_XMIT_DROP (after freeing
 * the skb) when the headroom is smaller than the aligned header size.
 */
static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
{
	unsigned int hh_alen = 0;
	unsigned int seq;
	unsigned int hh_len;

	do {
		seq = read_seqbegin(&hh->hh_lock);
		hh_len = READ_ONCE(hh->hh_len);
		/* Short header: always copy a fixed HH_DATA_MOD bytes. */
		if (likely(hh_len <= HH_DATA_MOD)) {
			hh_alen = HH_DATA_MOD;

			/* skb_push() would proceed silently if we have room for
			 * the unaligned size but not for the aligned size:
			 * check headroom explicitly.
			 */
			if (likely(skb_headroom(skb) >= HH_DATA_MOD)) {
				/* this is inlined by gcc */
				memcpy(skb->data - HH_DATA_MOD, hh->hh_data,
				       HH_DATA_MOD);
			}
		} else {
			/* Longer header: copy its length rounded by HH_DATA_ALIGN(). */
			hh_alen = HH_DATA_ALIGN(hh_len);

			if (likely(skb_headroom(skb) >= hh_alen)) {
				memcpy(skb->data - hh_alen, hh->hh_data,
				       hh_alen);
			}
		}
	} while (read_seqretry(&hh->hh_lock, seq));

	/* The copy above was skipped if the headroom was too small; drop. */
	if (WARN_ON_ONCE(skb_headroom(skb) < hh_alen)) {
		kfree_skb(skb);
		return NET_XMIT_DROP;
	}

	/* Extend the skb by the real (unaligned) header length. */
	__skb_push(skb, hh_len);
	return dev_queue_xmit(skb);
}

我们在代码中看到了顺序锁seqlock,还记得吗?我们在内核同步中讲过。最核心的代码是这里:

		if (likely(hh_len <= HH_DATA_MOD)) {
			hh_alen = HH_DATA_MOD;


			if (likely(skb_headroom(skb) >= HH_DATA_MOD)) {
				/* this is inlined by gcc */
				memcpy(skb->data - HH_DATA_MOD, hh->hh_data,
				       HH_DATA_MOD);
			}
		} else {
			hh_alen = HH_DATA_ALIGN(hh_len);

			if (likely(skb_headroom(skb) >= hh_alen)) {
				memcpy(skb->data - hh_alen, hh->hh_data,
				       hh_alen);
			}
		}

将缓存的头数据拷贝到skb;或者先对齐,再拷贝。

拷贝好后会用__skb_push更新skb内缓冲区指针,然后调用dev_queue_xmit进入网络设备子系统。

3.3. n->output

TODO…

它最终也会调用dev_queue_xmit

接下来我们进入到网络设备层,未完待续。

参考资料

TCP->IP输出 之 ip_queue_xmit、ip_build_and_send_pkt、ip_send_unicast_reply

发表评论

您的电子邮箱地址不会被公开。