试试用 shell 脚本来实现一个 CNI(bridge).

Why?

  • 好玩
  • k8s bridge CNI 的设计非常地通用和简单
  • 不用编译, 随时 debug!

How?

使用 Linux 下的命令行工具, 来实现官方实现下的 bridge 插件的(大部分)功能.

requirements

common

  • bash - 执行
  • ip - 大部分对网络的操作都使用了该命令
  • jq - 命令行解析 json, 似乎没什么选择…
  • ln - 软链接 netns 文件
  • arping - 为容器 IP 发送免费 ARP (gratuitous ARP)

Bridge

  • brctl - 仅用到了 brctl hairpin <bridge> <port> {on|off}, 用于实现发卡弯(hairpin)模式. 安装: yum(apt) install bridge-utils.

Start

脚本实现和 golang 实现差不多, 主要分为 command_add / command_del / command_version. 下面仅介绍 command_add 的实现过程:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# Handles the CNI ADD command: prepares the bridge, the netns link and the
# veth pair, runs the IPAM plugin, configures the container interface and
# finally prints the IPAM result.
# Globals (read):    IS_DEFAULT_GATEWAY, IP_MASQ, VETH_PREFIX, IPAM_RESULT
# Globals (written): IS_GATEWAY, host_veth_name
# Outputs: the IPAM result on stdout, plus a copy on stderr.
command_add() {
  # isDefaultGateway implies isGateway.
  if [[ "${IS_DEFAULT_GATEWAY}" == "true" ]]; then
    IS_GATEWAY="true"
  fi

  # Validate the configuration; mainly that promiscuous mode and hairpin mode
  # are not enabled at the same time.
  check_config

  # Create the bridge if it does not exist yet.
  setup_bridge

  # Symlink a path such as /proc/49875/ns/net into /var/run/netns so that the
  # `ip` command can address the container namespace.
  setup_netns

  # Random host-side veth name: the prefix followed by 8 hex characters.
  host_veth_name="${VETH_PREFIX}$(tr -cd 'a-f0-9' </dev/urandom | head -c 8)"

  # Create the veth pair inside the container netns (the container side is
  # always named eth0, so it cannot be created in the host netns).
  setup_veth "${host_veth_name}"

  # Obtain the IP from the IPAM plugin; the result is stored in IPAM_RESULT.
  exec_ipam

  # TODO: verify that the result is well-formed.
  check_ipam_result

  # Configure IP/routes on the container's eth0 and send gratuitous ARP.
  config_container_veth

  # Anything the two optional helpers print on stdout is redirected to stderr,
  # because stdout must carry nothing but the result printed below.
  # TODO
  if [[ "${IS_GATEWAY}" == "true" ]]; then
    config_gateway 1>&2
  fi

  # TODO IP MASQ
  if [[ "${IP_MASQ}" == "true" ]]; then
    config_ip_masq 1>&2
  fi

  # Output written to stderr shows up in the kubelet log.
  # printf '%s' emits the JSON verbatim; `echo -e` on an unquoted expansion
  # would interpret backslash escapes and collapse whitespace inside it.
  printf '%s\n' "${IPAM_RESULT}" >&2

  # Print the result on stdout.
  printf '%s\n' "${IPAM_RESULT}"
}

说明:

  1. 从环境变量中读取 kubelet 传给 CNI 的参数, 从 stdin 中读取 CNI 的配置
  2. 如果 bridge 不存在则创建
  3. 需要将容器的 netns 软链接到 /var/run/netns 下, 不然就需要用 nsenter 进入容器的 namespace
  4. 在容器的 netns 下创建 veth, 因为给容器的 peer 都叫 eth0, 所以不能在主机的 netns 下创建, 并将主机侧的 veth 移出, 连接到 bridge
  5. 执行 .ipam.type 指定的 IPAM 插件, 得到 IP/Route/DNS 等信息, 并据此配置容器侧的 eth0
  6. 可选地配置 bridge 的地址为网关, 以及配置 iptables 的 MASQ
  7. 输出结果到 stdout

Env and stdin

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# Environment variables defined by the CNI spec.
CNI_COMMAND=${CNI_COMMAND:-"VERSION"}
CNI_CONTAINERID=${CNI_CONTAINERID:-""}
CNI_NETNS=${CNI_NETNS:-""}
CNI_IFNAME=${CNI_IFNAME:-""}
CNI_ARGS=${CNI_ARGS:-""}
CNI_PATH=${CNI_PATH:-""}

# The network configuration is read from stdin. Field reference:
# https://github.com/containernetworking/plugins/blob/master/plugins/main/bridge/README.md
#
# INPUT is always passed quoted, so the JSON is not word-split or glob-expanded
# before jq sees it. `jq -r` prints the string "null" for an absent key; the
# `//` alternative substitutes a default instead: "cni0" for the bridge name
# and false for the boolean switches. MTU and IPAM_TYPE have no default here
# and are left exactly as jq prints them (possibly "null").
INPUT=$(cat)
BRIDGE_NAME=$(jq -r '.bridge // "cni0"' <<<"${INPUT}")
IS_GATEWAY=$(jq -r '.isGateway // false' <<<"${INPUT}")
IS_DEFAULT_GATEWAY=$(jq -r '.isDefaultGateway // false' <<<"${INPUT}")
FORCE_ADDRESS=$(jq -r '.forceAddress // false' <<<"${INPUT}")
IP_MASQ=$(jq -r '.ipMasq // false' <<<"${INPUT}")
HAIRPIN_MODE=$(jq -r '.hairpinMode // false' <<<"${INPUT}")
PROMISC_MODE=$(jq -r '.promiscMode // false' <<<"${INPUT}")
MTU=$(jq -r '.mtu' <<<"${INPUT}")
IPAM_TYPE=$(jq -r '.ipam.type' <<<"${INPUT}")

check_config/检查 stdin 的配置

1
2
3
4
5
6
# Validates the configuration: promiscuous mode and hairpin mode must not both
# be enabled.
# Globals (read): PROMISC_MODE, HAIRPIN_MODE
# Calls Fatal with an error message when both are "true".
check_config() {
  # Nothing to report unless both switches are on.
  if [[ "${PROMISC_MODE}" != "true" || "${HAIRPIN_MODE}" != "true" ]]; then
    return 0
  fi

  Fatal "cannot set hairpin mode and promiscous mode at the same time."
}

setup_bridge/创建并设置网桥

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Creates the bridge when it does not exist, applies the promiscuous-mode
# setting and brings the bridge up.
# Globals (read): BRIDGE_NAME, MTU, PROMISC_MODE
# Returns: non-zero when the bridge cannot be created or configured.
setup_bridge() {
  local -a add_args

  # Probe for the bridge; only the exit status matters, so output is dropped.
  if ! ip link show "${BRIDGE_NAME}" >/dev/null 2>&1; then
    add_args=(link add "${BRIDGE_NAME}")
    # mtu is optional in the configuration. When the key is absent MTU is
    # empty or the string "null" (as printed by `jq -r`), and passing that to
    # `ip link add` would make the command fail.
    if [[ -n "${MTU}" && "${MTU}" != "null" ]]; then
      add_args+=(mtu "${MTU}")
    fi
    # NOTE(review): `txqueuelen -1` is kept from the original command;
    # presumably it mirrors the Go plugin's "unset" value - confirm.
    add_args+=(txqueuelen -1 type bridge)
    if ! ip "${add_args[@]}"; then
      echo "setup_bridge: failed to create bridge '${BRIDGE_NAME}'" >&2
      return 1
    fi
  fi

  # Turn promiscuous mode on or off according to the configuration.
  if [[ "${PROMISC_MODE}" == "true" ]]; then
    ip link set dev "${BRIDGE_NAME}" promisc on || return 1
  else
    ip link set dev "${BRIDGE_NAME}" promisc off || return 1
  fi

  # Make sure the bridge is administratively up; this is a no-op when it
  # already is.
  ip link set dev "${BRIDGE_NAME}" up
}

setup_netns/设置netns

1
2
3
4
5
6
7
8
9
10
# Makes the container's network namespace addressable by `ip netns`, which
# looks namespaces up under /var/run/netns: a symlink named after
# ${CNI_CONTAINERID} is created there, pointing at ${CNI_NETNS}.
# Globals (read): CNI_NETNS, CNI_CONTAINERID
# Returns: non-zero when either variable is empty or the link cannot be made.
setup_netns() {
  # With an empty container id the link target would be the netns directory
  # itself, so refuse early with a clear message.
  if [[ -z "${CNI_NETNS}" || -z "${CNI_CONTAINERID}" ]]; then
    echo "setup_netns: CNI_NETNS and CNI_CONTAINERID must both be set" >&2
    return 1
  fi

  mkdir -p /var/run/netns/ || return 1
  # -f replaces a stale link; -T treats the destination as a plain file name.
  ln -sfT -- "${CNI_NETNS}" "/var/run/netns/${CNI_CONTAINERID}"
}

# Removes the /var/run/netns entry named after ${CNI_CONTAINERID}.
# Globals (read): CNI_CONTAINERID
# Returns: non-zero, without deleting anything, when the container id is empty
#          or contains a slash.
cleanup_netns() {
  # An empty id would expand the path to /var/run/netns/ and wipe every entry
  # in that directory; an id containing "/" could point outside of it.
  if [[ -z "${CNI_CONTAINERID}" || "${CNI_CONTAINERID}" == */* ]]; then
    echo "cleanup_netns: refusing to remove netns entry for container id '${CNI_CONTAINERID}'" >&2
    return 1
  fi

  rm -rf -- "/var/run/netns/${CNI_CONTAINERID}"
}

generate_veth_peer_name/随机 veth name

1
2
3
# Prints a random veth name: ${VETH_PREFIX} followed by 8 characters drawn
# from a-f and 0-9.
# Globals (read): VETH_PREFIX
generate_veth_peer_name() {
  local random_suffix

  # Keep only hex characters from the random stream and take the first 8.
  random_suffix=$(tr -cd 'a-f0-9' </dev/urandom | head -c 8)
  echo "${VETH_PREFIX}${random_suffix}"
}

setup_veth/创建并设置 veth peer

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# Creates the veth pair for the container and attaches the host end to the
# bridge.
# Globals (read):    CNI_CONTAINERID, CNI_IFNAME, MTU, BRIDGE_NAME, HAIRPIN_MODE
# Globals (written): host_veth_name (kept global, as in the original)
# Arguments: $1 - name for the host-side veth
# Returns: non-zero as soon as one step fails; later steps are not attempted.
setup_veth() {
  host_veth_name=${1}
  local -a add_args

  if [[ -z "${host_veth_name}" ]]; then
    echo "setup_veth: host veth name is required" >&2
    return 1
  fi

  add_args=(link add dev "${host_veth_name}" up)
  # mtu is optional; an empty value or the string "null" (printed by `jq -r`
  # for an absent key) must not be passed on to `ip link add`.
  if [[ -n "${MTU}" && "${MTU}" != "null" ]]; then
    add_args+=(mtu "${MTU}")
  fi
  add_args+=(type veth peer name "${CNI_IFNAME}")

  # Create the pair inside the netns named ${CNI_CONTAINERID}. The container
  # side always gets the same name (eth0), so the pairs cannot all be created
  # in the host netns.
  ip netns exec "${CNI_CONTAINERID}" \
    ip "${add_args[@]}" || return 1

  # Move the host end out of the container netns; `netns 1` targets the
  # namespace of PID 1.
  ip netns exec "${CNI_CONTAINERID}" \
    ip link set dev "${host_veth_name}" netns 1 || return 1

  # Bring up the host-side veth.
  ip link set "${host_veth_name}" up || return 1

  # Attach the host-side veth to the bridge.
  # Equivalent to: brctl addif ${BRIDGE_NAME} ${host_veth_name}
  ip link set "${host_veth_name}" master "${BRIDGE_NAME}" || return 1

  # Optionally enable hairpin mode on the bridge port.
  if [[ "${HAIRPIN_MODE}" == "true" ]]; then
    brctl hairpin "${BRIDGE_NAME}" "${host_veth_name}" on || return 1
  fi
}

exec_ipam/执行 ipam 的二进制文件, 获取 IP

1
2
3
4
5
6
7
8
# Runs the IPAM plugin named ${IPAM_TYPE}, feeding it the network
# configuration on stdin, and stores its stdout in IPAM_RESULT.
# Globals (read):    CNI_PATH (colon-separated directories), IPAM_TYPE, INPUT
# Globals (written): IPAM_RESULT
# Returns: the plugin's exit status, or 1 when no plugin file is found.
exec_ipam() {
  local -a search_dirs=()
  local dir

  # Split on ':' only, so that directories containing spaces stay intact.
  IFS=':' read -r -a search_dirs <<<"${CNI_PATH}"

  for dir in "${search_dirs[@]}"; do
    [[ -n "${dir}" ]] || continue
    if [[ -f "${dir}/${IPAM_TYPE}" && -x "${dir}/${IPAM_TYPE}" ]]; then
      # printf '%s' passes the JSON through verbatim; `echo -e` on an unquoted
      # expansion would rewrite backslash escapes inside it.
      IPAM_RESULT=$(printf '%s\n' "${INPUT}" | "${dir}/${IPAM_TYPE}")
      # Bare return: propagates the status of the assignment above, i.e. the
      # plugin's exit status.
      return
    fi
  done

  echo "exec_ipam: IPAM plugin '${IPAM_TYPE}' not found in CNI_PATH '${CNI_PATH}'" >&2
  return 1
}

config_container_veth/根据 IPAM 的结果配置容器内的 veth

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# Configures the container-side interface from the IPAM result: brings the
# link up, adds every IPv4 address, adds the routes, then announces each IPv4
# address with gratuitous ARP.
# Globals (read): CNI_CONTAINERID, CNI_IFNAME, IPAM_RESULT
# Command output is sent to stderr so that stdout stays free for the result.
#
# Shapes of the entries handled (taken from the original comments):
#   ip:    {"version":"4","address":"192.168.88.59/16","gateway":"192.168.1.1"}
#   route: {"gw":"192.168.1.1","dst":"0.0.0.0/0"}
config_container_veth() {
  local entry version address gateway dst gw addr
  local v4_gateway=""
  local -a v4_addresses=()

  # Bring the interface up.
  ip netns exec "${CNI_CONTAINERID}" \
    ip link set "${CNI_IFNAME}" up

  # Entries are read line by line from compact jq output on fd 3, so values
  # containing spaces are not word-split and commands in the loop body cannot
  # consume the list through stdin. `// []` tolerates a missing key.
  while IFS= read -r entry <&3; do
    version=$(jq -r '.version // empty' <<<"${entry}")
    address=$(jq -r '.address // empty' <<<"${entry}")
    gateway=$(jq -r '.gateway // empty' <<<"${entry}")
    [[ -n "${address}" ]] || continue

    # Entries without a version field are classified by their address: a
    # colon means IPv6.
    if [[ -z "${version}" ]]; then
      if [[ "${address}" == *:* ]]; then
        version="6"
      else
        version="4"
      fi
    fi
    [[ "${version}" == "4" ]] || continue

    v4_addresses+=("${address}")
    # Remember the first IPv4 gateway for routes that do not name one.
    if [[ -z "${v4_gateway}" && -n "${gateway}" ]]; then
      v4_gateway=${gateway}
    fi

    # Add the IP address.
    ip netns exec "${CNI_CONTAINERID}" \
      ip addr add "${address}" dev "${CNI_IFNAME}" 1>&2
  done 3< <(printf '%s\n' "${IPAM_RESULT}" | jq -c '(.ips // [])[]')

  while IFS= read -r entry <&3; do
    dst=$(jq -r '.dst // empty' <<<"${entry}")
    gw=$(jq -r '.gw // empty' <<<"${entry}")
    [[ -n "${dst}" ]] || continue

    # A route without "gw" falls back to the gateway of the IPv4 address
    # (IPv4 destinations only) instead of producing `via null`.
    if [[ -z "${gw}" && "${dst}" != *:* ]]; then
      gw=${v4_gateway}
    fi

    # Add the route; without any gateway it becomes a device route.
    if [[ -n "${gw}" ]]; then
      ip netns exec "${CNI_CONTAINERID}" \
        ip route add "${dst}" via "${gw}" 1>&2
    else
      ip netns exec "${CNI_CONTAINERID}" \
        ip route add "${dst}" dev "${CNI_IFNAME}" 1>&2
    fi
  done 3< <(printf '%s\n' "${IPAM_RESULT}" | jq -c '(.routes // [])[]')

  # Send gratuitous ARP for every IPv4 address.
  # ${addr%/*} strips the prefix length: 192.168.1.100/16 -> 192.168.1.100
  for addr in "${v4_addresses[@]}"; do
    ip netns exec "${CNI_CONTAINERID}" \
      arping -c 4 -A -I "${CNI_IFNAME}" "${addr%/*}" 1>&2
  done
}

config_gateway/配置到 br0 的默认路由

该部分逻辑在 ipam 返回所有路由信息的情况下, 是不需要的…所以留了 TODO.

1
2
3
# Placeholder for the gateway configuration on the bridge; not implemented.
# Outputs: the marker "TODO" on stderr.
config_gateway() {
  # The marker goes to stderr: command_add prints the IPAM result on stdout,
  # and any extra text there would corrupt that output.
  echo "TODO" >&2
}

config_ip_masq/配置 iptables NAT 表

当容器配置的是和主机同网段的 IP 时, 也不需要该部分配置…也留个 TODO 吧.

1
2
3
# Placeholder for the iptables NAT (masquerade) configuration; not implemented.
# Outputs: the marker "TODO" on stderr.
config_ip_masq() {
  # The marker goes to stderr: command_add prints the IPAM result on stdout,
  # and any extra text there would corrupt that output.
  echo "TODO" >&2
}

Talk is cheap, show me the code.

pikeszfish/shell-plugin-for-cni

Reference

containernetworking/plugins/plugins/main/bridge/README.md

Overview

With bridge plugin, all containers (on the same host) are plugged into a bridge (virtual switch) that resides in the host network namespace.
The containers receive one end of the veth pair with the other end connected to the bridge.
An IP address is only assigned to one end of the veth pair – one residing in the container.
The bridge itself can also be assigned an IP address, turning it into a gateway for the containers.
Alternatively, the bridge can function purely in L2 mode and would need to be bridged to the host network interface (if other than container-to-container communication on the same host is desired).

The network configuration specifies the name of the bridge to be used.
If the bridge is missing, the plugin will create one on first use and, if gateway mode is used, assign it an IP that was returned by IPAM plugin via the gateway field.

Example configuration

1
2
3
4
5
6
7
8
9
10
11
12
13
{
"name": "mynet",
"type": "bridge",
"bridge": "mynet0",
"isDefaultGateway": true,
"forceAddress": false,
"ipMasq": true,
"hairpinMode": true,
"ipam": {
"type": "host-local",
"subnet": "10.10.0.0/16"
}
}

Network configuration reference

  • name (string, required): the name of the network.
  • type (string, required): “bridge”.
  • bridge (string, optional): name of the bridge to use/create. Defaults to “cni0”.
  • isGateway (boolean, optional): assign an IP address to the bridge. Defaults to false.
  • isDefaultGateway (boolean, optional): Sets isGateway to true and makes the assigned IP the default route. Defaults to false.
  • forceAddress (boolean, optional): Indicates if a new IP address should be set if the previous value has been changed. Defaults to false.
  • ipMasq (boolean, optional): set up IP Masquerade on the host for traffic originating from this network and destined outside of it. Defaults to false.
  • mtu (integer, optional): explicitly set MTU to the specified value. Defaults to the value chosen by the kernel.
  • hairpinMode (boolean, optional): set hairpin mode for interfaces on the bridge. Defaults to false.
  • ipam (dictionary, required): IPAM configuration to be used for this network.
  • promiscMode (boolean, optional): set promiscuous mode on the bridge. Defaults to false.