有了对sk_buff、sock和net_device的了解,我们接着介绍传输层数据通路,这里将以TCP为例。本节只介绍数据通路,要讲协议本身又需要很大篇幅,需要另开一篇文章。
1. TCP协议层
1.1. tcp_sendmsg
TCP传输层的入口是tcp_sendmsg:
/*
 * tcp_sendmsg - locked entry point of the TCP send path.
 *
 * Calls lock_sock() on the socket, delegates the real work to
 * tcp_sendmsg_locked(), calls release_sock() and returns whatever the
 * locked helper returned.
 */
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
	int rc;

	lock_sock(sk);
	rc = tcp_sendmsg_locked(sk, msg, size);
	release_sock(sk);

	return rc;
}
sock是套接字的网络层表示,是socket结构的成员;而socket则是BSD套接字。听起来有点迷惑。
简单的上锁之后,我们进入tcp_sendmsg_locked。它将待发送的数据msg拆分、安排至一个双向链表中,它由sock结构体的sk_write_queue进行维护,每个元素都是一个sk_buff。sk_buff是一个骚气的数据结构,可以看作是网络栈各层次的通用包裹,包含各种各样的信息。各层都会创造自己的sk_buff,但里面包装的数据永远不会被重复拷贝。它作为网络栈的核心数据结构,值得一讲,不过这里我们只要理解它是数据缓冲区就好。
tcp_sendmsg_locked中有一个循环,用来处理报文段并从高速缓存中分配skb。由于需要处理的情况非常多,整个控制逻辑用了很多goto,读起来相当混乱。不论如何,TCP在循环中将报文段通过tcp_push传向下一层处理逻辑。
1.2. tcp_push
/*
 * tcp_push - mark the tail of the write queue and push pending frames.
 *
 * Looks at the last skb on the socket's write queue and returns at once
 * when the queue is empty.  Otherwise it may mark that skb for push,
 * hands flags to tcp_mark_urg(), may return early when autocorking
 * applies, and finally calls __tcp_push_pending_frames().
 */
void tcp_push(struct sock *sk, int flags, int mss_now,
int nonagle, int size_goal)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
skb = tcp_write_queue_tail(sk);
/* Empty write queue: nothing to push. */
if (!skb)
return;
/* Mark for push when MSG_MORE is not set, or when forced_push() says so. */
if (!(flags & MSG_MORE) || forced_push(tp))
tcp_mark_push(tp, skb);
tcp_mark_urg(tp, flags);
if (tcp_should_autocork(sk, skb, size_goal)) {
/* avoid atomic op if TSQ_THROTTLED bit is already set */
if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
}
/* It is possible TX completion already happened
 * before we set TSQ_THROTTLED.
 */
/* Return without pushing only while sk_wmem_alloc still exceeds this skb's truesize. */
if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
return;
}
/* MSG_MORE replaces the caller's nonagle value with TCP_NAGLE_CORK. */
if (flags & MSG_MORE)
nonagle = TCP_NAGLE_CORK;
__tcp_push_pending_frames(sk, mss_now, nonagle);
}
tcp_push会先判断skb中的元素是否需要push,需要的话就为PSH置位。置位是在sk_buff的cb域中做的,它是每一层的控制块,传输层会在控制块里塞入主机字节序的报文段头。
接下来,内核进入输出引擎部分。
1.3. TCP输出引擎
首先调用__tcp_push_pending_frames:
/*
 * Flush frames that were held back on the write queue, e.g. because of
 * TCP_CORK or an attempt at coalescing tiny packets.
 *
 * The caller must hold the socket lock.
 */
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
			       int nonagle)
{
	/*
	 * On a closed socket the bytes simply stay queued; the write
	 * queue is emptied once closedown finishes.
	 */
	if (unlikely(sk->sk_state == TCP_CLOSE))
		return;

	/* The probe timer is only checked when tcp_write_xmit() returns non-zero. */
	if (!tcp_write_xmit(sk, cur_mss, nonagle, 0,
			    sk_gfp_mask(sk, GFP_ATOMIC)))
		return;

	tcp_check_probe_timer(sk);
}
这一步会判断状态,如果连接已经关闭了,就什么都不做;否则,调用tcp_write_xmit。按正常的流程,它会通过tcp_transmit_skb发送报文,并在tcp_event_new_data_sent中完成事后的一些事件,例如统计计数、启动重传超时计时器等:
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;
tcp_event_new_data_sent(sk, skb);
tcp_transmit_skb会调用__tcp_transmit_skb,它执行最后的准备工作,例如计算校验和等,然后调用网络层接口。
/*
 * __tcp_transmit_skb - excerpt showing only the two address-family
 * dispatch points; the rest of the body is elided with placeholder
 * comments.
 */
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
/* ... */
/* Call the address family's send_check op (tcp_v6_send_check or tcp_v4_send_check). */
INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
tcp_v6_send_check, tcp_v4_send_check,
sk, skb);
/* ... */
/* Hand the skb to the address family's queue_xmit op (inet6_csk_xmit or ip_queue_xmit). */
err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
inet6_csk_xmit, ip_queue_xmit,
sk, skb, &inet->cork.fl);
/* ... */
}
2. IP协议层
这里的代码不得不说……嗯……有点抽象,接口也设计得比较奇怪。
2.1. ip_queue_xmit
ip_queue_xmit调用__ip_queue_xmit,后者会执行路由和IP头封装。
/*
 * __ip_queue_xmit - route an skb, build its IPv4 header and send it.
 *
 * Uses the route already attached to the skb if there is one, else the
 * route cached on the socket, else a fresh lookup.  Then pushes and
 * fills in the IP header (plus options, if any) and passes the skb to
 * ip_local_out().
 *
 * Returns the result of ip_local_out(), or -EHOSTUNREACH after freeing
 * the skb when no usable route exists.
 */
int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
__u8 tos)
{
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
struct rtable *rt;
struct iphdr *iph;
int res;
/* Skip all of this if the packet is already routed,
 * f.e. by something like SCTP.
 */
rcu_read_lock();
inet_opt = rcu_dereference(inet->inet_opt);
fl4 = &fl->u.ip4;
/* If the skb already carries a route, jump straight to header construction; otherwise a route must be found. */
rt = skb_rtable(skb);
if (rt)
goto packet_routed;
/* Make sure we can route this packet. */
/* Check whether the route cached on the socket is still valid. */
rt = (struct rtable *)__sk_dst_check(sk, 0);
if (!rt) {
__be32 daddr;
/* Use correct destination address if we have options. */
daddr = inet->inet_daddr;
if (inet_opt && inet_opt->opt.srr)
daddr = inet_opt->opt.faddr;
/* If this fails, retransmit mechanism of transport layer will
 * keep trying until route appears or the connection times
 * itself out.
 */
/* Look up the route. */
rt = ip_route_output_ports(net, fl4, sk,
daddr, inet->inet_saddr,
inet->inet_dport,
inet->inet_sport,
sk->sk_protocol,
RT_CONN_FLAGS_TOS(sk, tos),
sk->sk_bound_dev_if);
if (IS_ERR(rt))
goto no_route;
/* Store the freshly found route on the socket. */
sk_setup_caps(sk, &rt->dst);
}
/* Attach the route to the skb. */
skb_dst_set_noref(skb, &rt->dst);
packet_routed:
/* A strict-route option combined with a route that uses a gateway is treated as no route. */
if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
goto no_route;
/* OK, we know where to send it, allocate and build IP header. */
/* Build the IP header and set up the options. */
skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
skb_reset_network_header(skb);
iph = ip_hdr(skb);
/* First 16 bits of the header in one store: 4 in the top nibble, 5 in the next, then tos. */
*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
iph->frag_off = htons(IP_DF);
else
iph->frag_off = 0;
iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->protocol = sk->sk_protocol;
ip_copy_addrs(iph, fl4);
/* Transport layer set skb->h.foo itself. */
if (inet_opt && inet_opt->opt.optlen) {
/* optlen is in bytes; ihl grows by optlen / 4. */
iph->ihl += inet_opt->opt.optlen >> 2;
ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
}
ip_select_ident_segs(net, skb, sk,
skb_shinfo(skb)->gso_segs ?: 1);
/* TODO : should we use skb->sk here instead of sk ? */
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
/* Send the packet. */
res = ip_local_out(net, sk, skb);
rcu_read_unlock();
return res;
no_route:
/* No usable route: count it, drop the skb and report the host as unreachable. */
rcu_read_unlock();
IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
kfree_skb(skb);
return -EHOSTUNREACH;
}
这样三层数据报就产生了,接下来通过ip_local_out进行发送。
2.2. ip_local_out
到此,我们完全离开了传输层,并与UDP等协议产生的数据报胜利会师。
首先进行发送前检查,确认可以发送,则调用dst_output发送。
/*
 * ip_local_out - run the pre-send step, then transmit through dst_output().
 *
 * When __ip_local_out() returns 1 the packet is passed on to
 * dst_output() and that call's result is returned; any other value from
 * __ip_local_out() is returned unchanged.
 */
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int rc = __ip_local_out(net, sk, skb);

	return likely(rc == 1) ? dst_output(net, sk, skb) : rc;
}
2.3. 发送前检查:__ip_local_out
/*
 * __ip_local_out - finish the IPv4 header and run the LOCAL_OUT hook.
 *
 * Writes skb->len (converted with htons) into the header's tot_len,
 * calls ip_send_check() on the header, lets l3mdev_ip_out() process the
 * skb, sets skb->protocol and finally passes the packet to the
 * NF_INET_LOCAL_OUT netfilter hook with dst_output as the continuation.
 *
 * Returns 0 when l3mdev_ip_out() hands back no skb, otherwise the
 * result of nf_hook().
 */
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);

	/* if egress device is enslaved to an L3 master device pass the
	 * skb to its handler for processing
	 */
	skb = l3mdev_ip_out(sk, skb);
	if (unlikely(!skb))
		return 0;

	skb->protocol = htons(ETH_P_IP);

	/* Arguments: pf, hook, net, sk, skb, in device (none), out device, okfn. */
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
		       net, sk, skb, NULL, skb_dst(skb)->dev,
		       dst_output);
}
在该函数中进行最后的校验和计算,然后便调用nf_hook进入netfilter子系统,后者会处理NF_INET_LOCAL_OUT的hook函数,并确认是否允许数据报通过。nf_hook的返回值为1表示允许通过,表明调用者需要自行调用okfn函数,上例中即为dst_output。
2.4. dst_output
/* Output packet to network from transport. */
/*
 * Calls the output method of the dst entry attached to the skb and
 * returns its result.
 */
static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
return skb_dst(skb)->output(net, sk, skb);
}
它查找到关联到该skb的dst条目,然后调用output方法。通常,该方法是ip_output。
dst条目是dst_entry结构体,实现协议无关的目标缓存。对于TCP协议,它在ip_route_output_ports中被创建,后者进一步调用__mkroute_output来创建目标缓存条目。目标缓存和路由又是很大的内容,这里不再细讲。
2.5. ip_output
通过IP_UPD_PO_STATS更新统计计数,包括字节数和包数统计。接下来设置将发送此skb的设备和协议。
/*
 * ip_output - account an outgoing packet, bind it to its egress device
 * and pass it through the POST_ROUTING hook.
 *
 * The device the skb pointed to on entry is passed to the hook as the
 * input device and the dst entry's device as the output device.  The
 * hook is requested only when IPSKB_REROUTED is not set in the skb's
 * control block; ip_finish_output is the continuation.
 */
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *out_dev = skb_dst(skb)->dev;
	struct net_device *in_dev = skb->dev;

	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);

	skb->dev = out_dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
			    net, sk, skb, in_dev, out_dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
最后,通过调用NF_HOOK_COND将控制权转交给netfilter,这是一个函数而非宏。我们看看它的实现:
/*
 * NF_HOOK_COND - run a netfilter hook only when cond is true.
 *
 * With cond false the hook is bypassed and okfn is called directly.
 * With cond true nf_hook() runs first; a result of 1 means the caller
 * must invoke okfn itself, and any other result is returned as is.
 */
static inline int
NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
	     struct sk_buff *skb, struct net_device *in, struct net_device *out,
	     int (*okfn)(struct net *, struct sock *, struct sk_buff *),
	     bool cond)
{
	int verdict;

	if (!cond)
		return okfn(net, sk, skb);

	verdict = nf_hook(pf, hook, net, sk, skb, in, out, okfn);
	if (verdict != 1)
		return verdict;

	return okfn(net, sk, skb);
}
传入的条件cond表示是否需要经过netfilter钩子:若cond为假,则跳过nf_hook,直接执行okfn;若cond为真,则需要进入nf_hook,由NF_INET_POST_ROUTING链上的钩子函数进行裁决,只有当nf_hook返回1(允许调用者自行执行okfn)时才执行okfn。正常情况下okfn将会被执行,本例中为ip_finish_output。
2.6. ip_finish_output
在进入BPF子系统进行一些判断后,函数进入__ip_finish_output,后者执行分片相关的判断。
/*
 * __ip_finish_output - choose how an outgoing IPv4 packet leaves the IP layer.
 *
 * In order:
 *  - with CONFIG_NETFILTER and CONFIG_XFRM, a dst entry carrying xfrm
 *    state marks the skb IPSKB_REROUTED and goes back through dst_output();
 *  - a GSO skb is handed to ip_finish_output_gso();
 *  - a packet that fits the MTU and has no frag_max_size set goes
 *    straight to ip_finish_output2();
 *  - everything else is passed to ip_fragment() with ip_finish_output2
 *    as the callback.
 */
static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip_skb_dst_mtu(sk, skb);

	if (skb_is_gso(skb))
		return ip_finish_output_gso(net, sk, skb, mtu);

	/* Fast path: no fragmentation needed. */
	if (skb->len <= mtu && !IPCB(skb)->frag_max_size)
		return ip_finish_output2(net, sk, skb);

	return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
}
首先,如果内核启用了netfilter和数据包转换(XFRM),并且该skb的dst条目带有xfrm信息,就为skb置上IPSKB_REROUTED标志,并通过dst_output重新走一遍输出路径。
更为常见的情况是:
- 如果该skb是GSO报文(skb_is_gso为真),那么交给ip_finish_output_gso处理,后者会进行分片等处理,最终调用ip_finish_output2;
- 否则,查看数据报长度,如果大于MTU(或者设置了frag_max_size),需要交给ip_fragment分片,后者会调用回调函数ip_finish_output2进行发送;
- 否则,可以直接调用ip_finish_output2进行发送。
所以,ip_finish_output2时而是回调函数,时而不是,真是太奇怪了。通过grep命令查到所有ip_fragment的调用都是将ip_finish_output2作为回调函数。
2.7. ip_finish_output2
兜兜转转这么长时间我们终于到了IP协议层最终的出口。
/*
 * ip_finish_output2 - last step of the IP output path (first half of the
 * listing): update multicast/broadcast counters and make sure the skb has
 * enough headroom for the link-layer header.
 */
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = (struct rtable *)dst;
struct net_device *dev = dst->dev;
unsigned int hh_len = LL_RESERVED_SPACE(dev);
struct neighbour *neigh;
bool is_v6gw = false;
/* Per-route-type output statistics: multicast or broadcast. */
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
/* Be paranoid, rather than too clever. */
/* Headroom smaller than hh_len on a device with header_ops: reallocate. */
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
struct sk_buff *skb2;
skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
if (!skb2) {
kfree_skb(skb);
return -ENOMEM;
}
/* Carry the owning socket over to the new skb, then release the old one. */
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
consume_skb(skb);
skb = skb2;
}
首先判断路由类型(rt_type)是否是多播或者广播,并增加相应的统计数据。其次要确保skb的头部空间能够容纳下链路层头,不够就调用skb_realloc_headroom分配一个新的skb,并释放原来的skb。不过这种情况不太容易发生。
/*
 * Second half of ip_finish_output2(): when lwtunnel_xmit_redirect() is
 * true, call lwtunnel_xmit() and return its result if that is negative
 * or LWTUNNEL_XMIT_DONE.
 */
if (lwtunnel_xmit_redirect(dst->lwtstate)) {
int res = lwtunnel_xmit(skb);
if (res < 0 || res == LWTUNNEL_XMIT_DONE)
return res;
}
/* Get the neighbour entry for this route and transmit through it. */
rcu_read_lock_bh();
neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
if (!IS_ERR(neigh)) {
int res;
sock_confirm_neigh(skb, neigh);
/* if crossing protocols, can not use the cached header */
res = neigh_output(neigh, skb, is_v6gw);
rcu_read_unlock_bh();
return res;
}
rcu_read_unlock_bh();
/* No neighbour could be obtained: log (rate limited), drop the skb and fail. */
net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
__func__);
kfree_skb(skb);
return -EINVAL;
}
下面通过ip_neigh_for_gw获取邻居缓存,ip_neigh_for_gw的逻辑是先查找,找不到则创建一个新的邻居,这种情况发生在第一次和某主机通信的时候。如果邻居获取成功,就调用neigh_output进行发送。
3. 邻居子系统
3.1. neigh_output
有两条线可供选择,一条是neigh_hh_output,另一条是n->output。
/*
 * neigh_output - send an skb through a neighbour entry.
 *
 * Goes through neigh_hh_output() with the neighbour's cached hardware
 * header only when the caller did not ask to skip the cache, the
 * neighbour state has a NUD_CONNECTED bit set and a cached header
 * length is present.  In every other case the neighbour's own output
 * method is called.
 */
static inline int neigh_output(struct neighbour *n, struct sk_buff *skb,
			       bool skip_cache)
{
	const struct hh_cache *hh = &n->hh;

	/* n->nud_state and hh->hh_len could be changed under us.
	 * neigh_hh_output() is taking care of the race later.
	 */
	if (skip_cache ||
	    !(READ_ONCE(n->nud_state) & NUD_CONNECTED) ||
	    !READ_ONCE(hh->hh_len))
		return n->output(n, skb);

	return neigh_hh_output(hh, skb);
}
- !skip_cache:传入的参数是is_v6gw,该参数在获取邻居时(ip_neigh_for_gw)确定。当它为真时(跨协议,例如下一跳网关是IPv6地址),不能使用缓存的硬件头;换言之,该条件在没有跨协议时成立。
- NUD_CONNECTED:满足以下三个条件之一:静态路由、不需要ARP请求(例如多播、广播、环回)、邻居可达(即已经成功处理ARP请求)。
- hh->hh_len:“硬件头”长度,不为0说明硬件头已经被缓存。
当上面三个条件均成立时,调用neigh_hh_output,否则调用n->output。
3.2. neigh_hh_output
在发送到网络设备子系统前,将头数据复制到skb。
/*
 * neigh_hh_output - copy the cached hardware header in front of the
 * skb's data and queue the skb for transmission.
 *
 * The copy runs inside a read_seqbegin()/read_seqretry() loop on
 * hh->hh_lock and is repeated whenever read_seqretry() asks for a retry.
 * If the skb lacks the headroom for the (aligned) header, the skb is
 * freed and NET_XMIT_DROP is returned; otherwise the result of
 * dev_queue_xmit() is returned.
 */
static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
{
unsigned int hh_alen = 0;
unsigned int seq;
unsigned int hh_len;
do {
seq = read_seqbegin(&hh->hh_lock);
hh_len = READ_ONCE(hh->hh_len);
/* Header no longer than HH_DATA_MOD: copy a fixed HH_DATA_MOD bytes. */
if (likely(hh_len <= HH_DATA_MOD)) {
hh_alen = HH_DATA_MOD;
/* skb_push() would proceed silently if we have room for
 * the unaligned size but not for the aligned size:
 * check headroom explicitly.
 */
if (likely(skb_headroom(skb) >= HH_DATA_MOD)) {
/* this is inlined by gcc */
memcpy(skb->data - HH_DATA_MOD, hh->hh_data,
HH_DATA_MOD);
}
} else {
/* Longer header: copy the length rounded by HH_DATA_ALIGN(). */
hh_alen = HH_DATA_ALIGN(hh_len);
if (likely(skb_headroom(skb) >= hh_alen)) {
memcpy(skb->data - hh_alen, hh->hh_data,
hh_alen);
}
}
} while (read_seqretry(&hh->hh_lock, seq));
/* The copy above was skipped when headroom was short; drop the packet in that case. */
if (WARN_ON_ONCE(skb_headroom(skb) < hh_alen)) {
kfree_skb(skb);
return NET_XMIT_DROP;
}
/* Extend the skb's data area by the real header length, not the aligned one. */
__skb_push(skb, hh_len);
return dev_queue_xmit(skb);
}
我们在代码中看到了顺序锁seqlock,还记得吗?我们在内核同步中讲过。最核心的代码是这里:
if (likely(hh_len <= HH_DATA_MOD)) {
hh_alen = HH_DATA_MOD;
if (likely(skb_headroom(skb) >= HH_DATA_MOD)) {
/* this is inlined by gcc */
memcpy(skb->data - HH_DATA_MOD, hh->hh_data,
HH_DATA_MOD);
}
} else {
hh_alen = HH_DATA_ALIGN(hh_len);
if (likely(skb_headroom(skb) >= hh_alen)) {
memcpy(skb->data - hh_alen, hh->hh_data,
hh_alen);
}
}
将缓存的头数据拷贝到skb;或者先对齐,再拷贝。
拷贝好后会用__skb_push更新skb内缓冲区指针,然后调用dev_queue_xmit进入网络设备子系统。
3.3. n->output
TODO…
它最终也会调用dev_queue_xmit。
接下来我们进入到网络设备层,未完待续。
参考资料
TCP->IP输出 之 ip_queue_xmit、ip_build_and_send_pkt、ip_send_unicast_reply