Skip to content

Commit 1ba8194

Browse files
author
Flavio Crisciani
authored
Merge pull request #2071 from fcrisciani/ssd
Import the ssd tool in libnetwork
2 parents c592ee4 + 2c34fe2 commit 1ba8194

File tree

3 files changed

+261
-0
lines changed

3 files changed

+261
-0
lines changed

cmd/ssd/Dockerfile

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
FROM alpine:3.7
2+
ENV PACKAGES="\
3+
musl \
4+
linux-headers \
5+
build-base \
6+
util-linux \
7+
bash \
8+
git \
9+
ca-certificates \
10+
python2 \
11+
python2-dev \
12+
py-setuptools \
13+
iproute2 \
14+
curl \
15+
strace \
16+
drill \
17+
ipvsadm \
18+
iperf \
19+
ethtool \
20+
"
21+
22+
RUN echo \
23+
&& apk add --no-cache $PACKAGES \
24+
&& if [[ ! -e /usr/bin/python ]]; then ln -sf /usr/bin/python2.7 /usr/bin/python; fi \
25+
&& if [[ ! -e /usr/bin/python-config ]]; then ln -sf /usr/bin/python2.7-config /usr/bin/python-config; fi \
26+
&& if [[ ! -e /usr/bin/easy_install ]]; then ln -sf /usr/bin/easy_install-2.7 /usr/bin/easy_install; fi \
27+
&& easy_install pip \
28+
&& pip install --upgrade pip \
29+
&& if [[ ! -e /usr/bin/pip ]]; then ln -sf /usr/bin/pip2.7 /usr/bin/pip; fi \
30+
&& echo
31+
32+
ADD ssd.py /
33+
RUN pip install git+git://github.com/docker/docker-py.git
34+
ENTRYPOINT [ "python", "/ssd.py"]

cmd/ssd/README.md

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# Docker Swarm Service Driller(ssd)
2+
3+
ssd is a troubleshooting utility for Docker swarm networks.
4+
5+
### control-plane and datapath consistency check on a node
6+
ssd checks for the consistency between docker network control-plane (from the docker daemon in-memory state) and kernel data path programming. Currently the tool checks only for the consistency of the Load balancer (implemented using IPVS).
7+
8+
In a three node swarm cluser ssd status for a overlay network `ov2` which has three services running, each replicated to 3 instances.
9+
10+
````bash
11+
vagrant@net-1:~/code/go/src/github.com/docker/docker-e2e/tests$ docker run -v /var/run/docker.sock:/var/run/docker.sock -v /var/run/docker/netns:/var/run/docker/netns --privileged --net=host sanimej/ssd ov2
12+
Verifying LB programming for containers on network ov2
13+
Verifying container /s2.3.ltrdwef0iqf90rqauw3ehcs56...
14+
service s2... OK
15+
service s3... OK
16+
service s1... OK
17+
Verifying container /s3.3.nyhwvdvnocb4wftyhb8dr4fj8...
18+
service s2... OK
19+
service s3... OK
20+
service s1... OK
21+
Verifying container /s1.3.wwx5tuxhnvoz5vrb8ohphby0r...
22+
service s2... OK
23+
service s3... OK
24+
service s1... OK
25+
Verifying LB programming for containers on network ingress
26+
Verifying container Ingress...
27+
service web... OK
28+
````
29+
30+
ssd checks the required iptables programming to direct an incoming packet with the <host ip>:<published port> to the right <backend ip>:<target port>
31+
32+
### control-plane consistency check across nodes in a cluster
33+
34+
Docker networking uses a gossip protocol to synchronize networking state across nodes in a cluster. ssd's `gossip-consistency` command verifies if the state maintained by all the nodes are consistent.
35+
36+
````bash
37+
In a three node cluster with services running on an overlay network ov2 ssd consistency-checker shows
38+
39+
vagrant@net-1:~/code/go/src/github.com/docker/docker-e2e/tests$ docker run -v /var/run/docker.sock:/var/run/docker.sock -v /var/run/docker/netns:/var/run/docker/netns --privileged sanimej/ssd ov2 gossip-consistency
40+
Node id: sjfp0ca8f43rvnab6v7f21gq0 gossip hash c57d89094dbb574a37930393278dc282
41+
42+
Node id: bg228r3q9095grj4wxkqs80oe gossip hash c57d89094dbb574a37930393278dc282
43+
44+
Node id: 6jylcraipcv2pxdricqe77j5q gossip hash c57d89094dbb574a37930393278dc282
45+
````
46+
47+
This is hash digest of the control-plane state for the network `ov2` from all the cluster nodes. If the values have a mismatch `docker network inspect --verbose` on the individual nodes can help in identifying what the specific difference is.

cmd/ssd/ssd.py

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
#!/usr/bin/python
2+
3+
import sys, signal, time
4+
import docker
5+
import re
6+
import subprocess
7+
import json
8+
import hashlib
9+
10+
ipv4match = re.compile(
11+
r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9]).' +
12+
r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9]).' +
13+
r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9]).' +
14+
r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])'
15+
)
16+
17+
def check_iptables(name, plist):
18+
replace = (':', ',')
19+
ports = []
20+
for port in plist:
21+
for r in replace:
22+
port = port.replace(r, ' ')
23+
24+
p = port.split()
25+
ports.append((p[1], p[3]))
26+
27+
# get the ingress sandbox's docker_gwbridge network IP.
28+
# published ports get DNAT'ed to this IP.
29+
ip = subprocess.check_output(['/usr/bin/nsenter', '--net=/var/run/docker/netns/ingress_sbox', '/bin/bash', '-c', 'ifconfig eth1 | grep \"inet\\ addr\" | cut -d: -f2 | cut -d\" \" -f1'])
30+
ip = ip.rstrip()
31+
32+
for p in ports:
33+
rule = '/sbin/iptables -t nat -C DOCKER-INGRESS -p tcp --dport {0} -j DNAT --to {1}:{2}'.format(p[1], ip, p[1])
34+
try:
35+
subprocess.check_output(["/bin/bash", "-c", rule])
36+
except subprocess.CalledProcessError as e:
37+
print "Service {0}: host iptables DNAT rule for port {1} -> ingress sandbox {2}:{3} missing".format(name, p[1], ip, p[1])
38+
39+
def get_namespaces(data, ingress=False):
40+
if ingress is True:
41+
return {"Ingress":"/var/run/docker/netns/ingress_sbox"}
42+
else:
43+
spaces =[]
44+
for c in data["Containers"]:
45+
sandboxes = {str(c) for c in data["Containers"]}
46+
47+
containers = {}
48+
for s in sandboxes:
49+
spaces.append(str(cli.inspect_container(s)["NetworkSettings"]["SandboxKey"]))
50+
inspect = cli.inspect_container(s)
51+
containers[str(inspect["Name"])] = str(inspect["NetworkSettings"]["SandboxKey"])
52+
return containers
53+
54+
55+
def check_network(nw_name, ingress=False):
56+
57+
print "Verifying LB programming for containers on network %s" % nw_name
58+
59+
data = cli.inspect_network(nw_name, verbose=True)
60+
61+
services = data["Services"]
62+
fwmarks = {str(service): str(svalue["LocalLBIndex"]) for service, svalue in services.items()}
63+
64+
stasks = {}
65+
for service, svalue in services.items():
66+
if service == "":
67+
continue
68+
tasks = []
69+
for task in svalue["Tasks"]:
70+
tasks.append(str(task["EndpointIP"]))
71+
stasks[fwmarks[str(service)]] = tasks
72+
73+
# for services in ingress network verify the iptables rules
74+
# that direct ingress (published port) to backend (target port)
75+
if ingress is True:
76+
check_iptables(service, svalue["Ports"])
77+
78+
containers = get_namespaces(data, ingress)
79+
for container, namespace in containers.items():
80+
print "Verifying container %s..." % container
81+
ipvs = subprocess.check_output(['/usr/bin/nsenter', '--net=%s' % namespace, '/usr/sbin/ipvsadm', '-ln'])
82+
83+
mark = ""
84+
realmark = {}
85+
for line in ipvs.splitlines():
86+
if "FWM" in line:
87+
mark = re.findall("[0-9]+", line)[0]
88+
realmark[str(mark)] = []
89+
elif "->" in line:
90+
if mark == "":
91+
continue
92+
ip = ipv4match.search(line)
93+
if ip is not None:
94+
realmark[mark].append(format(ip.group(0)))
95+
else:
96+
mark = ""
97+
for key in realmark.keys():
98+
if key not in stasks:
99+
print "LB Index %s" % key, "present in IPVS but missing in docker daemon"
100+
del realmark[key]
101+
102+
for key in stasks.keys():
103+
if key not in realmark:
104+
print "LB Index %s" % key, "present in docker daemon but missing in IPVS"
105+
del stasks[key]
106+
107+
for key in realmark:
108+
service = "--Invalid--"
109+
for sname, idx in fwmarks.items():
110+
if key == idx:
111+
service = sname
112+
if len(set(realmark[key])) != len(set(stasks[key])):
113+
print "Incorrect LB Programming for service %s" % service
114+
print "control-plane backend tasks:"
115+
for task in stasks[key]:
116+
print task
117+
print "kernel IPVS backend tasks:"
118+
for task in realmark[key]:
119+
print task
120+
else:
121+
print "service %s... OK" % service
122+
123+
if __name__ == '__main__':
124+
if len(sys.argv) < 2:
125+
print 'Usage: ssd.py network-name [gossip-consistency]'
126+
sys.exit()
127+
128+
cli = docker.APIClient(base_url='unix://var/run/docker.sock', version='auto')
129+
if len(sys.argv) == 3:
130+
command = sys.argv[2]
131+
else:
132+
command = 'default'
133+
134+
if command == 'gossip-consistency':
135+
cspec = docker.types.ContainerSpec(
136+
image='sanimej/ssd',
137+
args=[sys.argv[1], 'gossip-hash'],
138+
mounts=[docker.types.Mount('/var/run/docker.sock', '/var/run/docker.sock', type='bind')]
139+
)
140+
mode = docker.types.ServiceMode(
141+
mode='global'
142+
)
143+
task_template = docker.types.TaskTemplate(cspec)
144+
145+
cli.create_service(task_template, name='gossip-hash', mode=mode)
146+
#TODO change to a deterministic way to check if the service is up.
147+
time.sleep(5)
148+
output = cli.service_logs('gossip-hash', stdout=True, stderr=True, details=True)
149+
for line in output:
150+
print("Node id: %s gossip hash %s" % (line[line.find("=")+1:line.find(",")], line[line.find(" ")+1:]))
151+
if cli.remove_service('gossip-hash') is not True:
152+
print("Deleting gossip-hash service failed")
153+
elif command == 'gossip-hash':
154+
data = cli.inspect_network(sys.argv[1], verbose=True)
155+
services = data["Services"]
156+
md5 = hashlib.md5()
157+
entries = []
158+
for service, value in services.items():
159+
entries.append(service)
160+
entries.append(value["VIP"])
161+
for task in value["Tasks"]:
162+
for key, val in task.items():
163+
if isinstance(val, dict):
164+
for k, v in val.items():
165+
entries.append(v)
166+
else:
167+
entries.append(val)
168+
entries.sort()
169+
for e in entries:
170+
md5.update(e)
171+
print(md5.hexdigest())
172+
sys.stdout.flush()
173+
while True:
174+
signal.pause()
175+
elif command == 'default':
176+
if sys.argv[1] == "ingress":
177+
check_network("ingress", ingress=True)
178+
else:
179+
check_network(sys.argv[1])
180+
check_network("ingress", ingress=True)

0 commit comments

Comments
 (0)