Skip to content

Commit ec1598e

Browse files
committed
Adding fault injection integ tests
1 parent 79f17a5 commit ec1598e

File tree

2 files changed

+378
-1
lines changed

2 files changed

+378
-1
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ run-integ-tests: test-registry gremlin start-ebs-csi-driver container-health-che
181181
$(MAKE) stop-ebs-csi-driver
182182

183183
run-sudo-tests:
184-
sudo -E ${GOTEST} -tags sudo -timeout=10m ./agent/...
184+
sudo -E ${GOTEST} -tags sudo -timeout=10m ./agent/... ./ecs-agent/...
185185

186186
run-sudo-unit-tests:
187187
sudo -E ${GOTEST} -tags 'sudo_unit' -timeout=60s ./agent/...
Lines changed: 377 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,377 @@
1+
//go:build linux && sudo
2+
// +build linux,sudo
3+
4+
// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License"). You may
7+
// not use this file except in compliance with the License. A copy of the
8+
// License is located at
9+
//
10+
// http://aws.amazon.com/apache2.0/
11+
//
12+
// or in the "license" file accompanying this file. This file is distributed
13+
// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
14+
// express or implied. See the License for the specific language governing
15+
// permissions and limitations under the License.
16+
17+
package handlers
18+
19+
import (
20+
"bytes"
21+
"context"
22+
"encoding/json"
23+
"fmt"
24+
"net"
25+
"net/http"
26+
"os/exec"
27+
"testing"
28+
"time"
29+
30+
"github.com/aws/amazon-ecs-agent/ecs-agent/api/ecs/model/ecs"
31+
mock_metrics "github.com/aws/amazon-ecs-agent/ecs-agent/metrics/mocks"
32+
"github.com/aws/amazon-ecs-agent/ecs-agent/tmds/handlers/fault/v1/types"
33+
v2 "github.com/aws/amazon-ecs-agent/ecs-agent/tmds/handlers/v2"
34+
state "github.com/aws/amazon-ecs-agent/ecs-agent/tmds/handlers/v4/state"
35+
mock_state "github.com/aws/amazon-ecs-agent/ecs-agent/tmds/handlers/v4/state/mocks"
36+
"github.com/aws/amazon-ecs-agent/ecs-agent/tmds/utils/netconfig"
37+
"github.com/aws/amazon-ecs-agent/ecs-agent/utils/execwrapper"
38+
39+
"github.com/golang/mock/gomock"
40+
"github.com/gorilla/mux"
41+
"github.com/stretchr/testify/assert"
42+
"github.com/stretchr/testify/require"
43+
)
44+
45+
const (
46+
endpointId = "endpoint"
47+
taskARN = "t1"
48+
startEndpoint = "/api/%s/fault/v1/%s/start"
49+
stopEndpoint = "/api/%s/fault/v1/%s/stop"
50+
statusEndpoint = "/api/%s/fault/v1/%s/status"
51+
)
52+
53+
var (
54+
agentStateExpectations = func(t *testing.T, agentState *mock_state.MockAgentState, netConfigClient *netconfig.NetworkConfigClient) {
55+
taskResponse := getFaultInjectionTaskResponse(t, netConfigClient)
56+
agentState.EXPECT().GetTaskMetadataWithTaskNetworkConfig(gomock.Any(), netConfigClient).Return(taskResponse, nil).AnyTimes()
57+
}
58+
)
59+
60+
// Getter function to construct and return a task response with a task network config object
61+
// which is mainly used as the return value of a GetTaskMetadataWithTaskNetworkConfig call
62+
func getFaultInjectionTaskResponse(t *testing.T, netConfigClient *netconfig.NetworkConfigClient) state.TaskResponse {
63+
deviceName, err := getHostNetworkInterfaceName(netConfigClient)
64+
require.NoError(t, err)
65+
taskNetworkConfig := state.TaskNetworkConfig{
66+
NetworkMode: ecs.NetworkModeHost,
67+
NetworkNamespaces: []*state.NetworkNamespace{
68+
{
69+
Path: "/some/path",
70+
NetworkInterfaces: []*state.NetworkInterface{
71+
{
72+
DeviceName: deviceName,
73+
},
74+
},
75+
},
76+
},
77+
}
78+
taskResponse := state.TaskResponse{
79+
TaskResponse: &v2.TaskResponse{TaskARN: taskARN},
80+
TaskNetworkConfig: &taskNetworkConfig,
81+
FaultInjectionEnabled: true,
82+
}
83+
84+
return taskResponse
85+
}
86+
87+
// getHostNetworkInterfaceName obtains the default network interface name on the host via the DefaultNetInterfaceName method
88+
func getHostNetworkInterfaceName(netConfigClient *netconfig.NetworkConfigClient) (string, error) {
89+
deviceName, err := netconfig.DefaultNetInterfaceName(netConfigClient.NetlinkClient)
90+
return deviceName, err
91+
}
92+
93+
// Getter function to construct and return a request body for black hole port loss request
94+
func getNetworkBlackHolePortRequestBody(port int, protocol, trafficType string) map[string]interface{} {
95+
return map[string]interface{}{
96+
"Port": port,
97+
"Protocol": protocol,
98+
"TrafficType": trafficType,
99+
}
100+
}
101+
102+
// Getter function to construct and return a request body for network latency request
103+
func getNetworkLatencyRequestBody(delayMilliseconds, jitterMilliseconds int, ipSources, ipSourcesToFilter []string) map[string]interface{} {
104+
return map[string]interface{}{
105+
"DelayMilliseconds": delayMilliseconds,
106+
"JitterMilliseconds": jitterMilliseconds,
107+
"Sources": ipSources,
108+
"SourcesToFilter": ipSourcesToFilter,
109+
}
110+
}
111+
112+
// Getter function to construct and return a request body for network packet loss request
113+
func getNetworkPacketLossRequestBody(lossPercent int, ipSources, ipSourcesToFilter []string) map[string]interface{} {
114+
return map[string]interface{}{
115+
"LossPercent": lossPercent,
116+
"Sources": ipSources,
117+
"SourcesToFilter": ipSourcesToFilter,
118+
}
119+
}
120+
121+
// Helper function that starts a HTTP server as well as set up the fault injection handlers and required mocks
122+
func startServer(t *testing.T) (*http.Server, int) {
123+
// Mocks
124+
ctrl := gomock.NewController(t)
125+
defer ctrl.Finish()
126+
127+
agentState := mock_state.NewMockAgentState(ctrl)
128+
nc := netconfig.NewNetworkConfigClient()
129+
agentStateExpectations(t, agentState, nc)
130+
131+
metricsFactory := mock_metrics.NewMockEntryFactory(ctrl)
132+
router := mux.NewRouter()
133+
handler := New(agentState, metricsFactory, execwrapper.NewExec())
134+
135+
// Registering the network black hole port fault injection handlers
136+
router.HandleFunc(
137+
fmt.Sprintf(startEndpoint, endpointId, types.BlackHolePortFaultType),
138+
handler.StartNetworkBlackholePort(),
139+
).Methods(http.MethodPost)
140+
router.HandleFunc(
141+
fmt.Sprintf(stopEndpoint, endpointId, types.BlackHolePortFaultType),
142+
handler.StopNetworkBlackHolePort(),
143+
).Methods(http.MethodPost)
144+
router.HandleFunc(
145+
fmt.Sprintf(statusEndpoint, endpointId, types.BlackHolePortFaultType),
146+
handler.CheckNetworkBlackHolePort(),
147+
).Methods(http.MethodPost)
148+
149+
// Registering the network latency hole fault injection handlers
150+
router.HandleFunc(
151+
fmt.Sprintf(startEndpoint, endpointId, types.LatencyFaultType),
152+
handler.StartNetworkLatency(),
153+
).Methods(http.MethodPost)
154+
router.HandleFunc(
155+
fmt.Sprintf(stopEndpoint, endpointId, types.LatencyFaultType),
156+
handler.StopNetworkLatency(),
157+
).Methods(http.MethodPost)
158+
router.HandleFunc(
159+
fmt.Sprintf(statusEndpoint, endpointId, types.LatencyFaultType),
160+
handler.CheckNetworkLatency(),
161+
).Methods(http.MethodPost)
162+
163+
// Registering the network packet loss fault injection handlers
164+
router.HandleFunc(
165+
fmt.Sprintf(startEndpoint, endpointId, types.PacketLossFaultType),
166+
handler.StartNetworkPacketLoss(),
167+
).Methods(http.MethodPost)
168+
router.HandleFunc(
169+
fmt.Sprintf(stopEndpoint, endpointId, types.PacketLossFaultType),
170+
handler.StopNetworkPacketLoss(),
171+
).Methods(http.MethodPost)
172+
router.HandleFunc(
173+
fmt.Sprintf(statusEndpoint, endpointId, types.PacketLossFaultType),
174+
handler.CheckNetworkPacketLoss(),
175+
).Methods(http.MethodPost)
176+
177+
server := &http.Server{
178+
Addr: ":0", // Lets the system allocate an available port
179+
Handler: router,
180+
}
181+
182+
// Obtaining the port being used by the server
183+
listener, err := net.Listen("tcp", server.Addr)
184+
require.NoError(t, err)
185+
port := listener.Addr().(*net.TCPAddr).Port
186+
187+
go func() {
188+
if err := server.Serve(listener); err != nil && err != http.ErrServerClosed {
189+
t.Logf("ListenAndServe(): %s\n", err)
190+
require.NoError(t, err)
191+
}
192+
}()
193+
194+
return server, port
195+
}
196+
197+
// // Helper function to shut down a HTTP server
198+
func shutdownServer(t *testing.T, server *http.Server) {
199+
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
200+
defer cancel()
201+
if err := server.Shutdown(ctx); err != nil {
202+
t.Logf("Server Shutdown Failed:%+v", err)
203+
} else {
204+
t.Logf("Server Exited Properly")
205+
}
206+
}
207+
208+
// Helper function that generates a local host URL with a certain port and path. This will be used to make HTTP requests to a HTTP server.
209+
func getURL(port int, path string) string {
210+
return "http://localhost:" + fmt.Sprintf("%d", port) + path
211+
}
212+
213+
// cleanupBlackHolePortFault will make requests to stop network black hole port requests within the server as a way to clean up after starting a black hole port fault injection on the host.
214+
func cleanupBlackHolePortFault(t *testing.T, serverPort int, body interface{}) {
215+
client := &http.Client{}
216+
217+
// Making a stop black hole port fault request to the server
218+
bodyBytes, _ := json.Marshal(body)
219+
bodyReader := bytes.NewReader(bodyBytes)
220+
req, err := http.NewRequest(http.MethodPost, getURL(serverPort, fmt.Sprintf(stopEndpoint, endpointId, types.BlackHolePortFaultType)), bodyReader)
221+
if err != nil {
222+
t.Logf("Unable to create stop black hole port request")
223+
}
224+
_, err = client.Do(req)
225+
if err != nil {
226+
t.Logf("Error occurred while cleaning up network black hole port fault injection: %v\n", err)
227+
}
228+
}
229+
230+
// cleanupLatencyAndPacketLossFaults will make requests to stop both network latency and network packet loss requests within the server as a way to clean up
231+
// after starting either a latency and/or packet loss fault injection on the host.
232+
func cleanupLatencyAndPacketLossFaults(t *testing.T, serverPort int) {
233+
client := &http.Client{}
234+
235+
// Making a stop network latency fault request to the server
236+
req1, err := http.NewRequest(http.MethodPost, getURL(serverPort, fmt.Sprintf(stopEndpoint, endpointId, "network-latency")), nil)
237+
if err != nil {
238+
t.Logf("Unable to create stop latency request")
239+
}
240+
_, err = client.Do(req1)
241+
if err != nil {
242+
t.Logf("Error occurred while cleaning up network latency fault injection: %v\n", err)
243+
}
244+
245+
// Making a stop network packet loss fault request to the server
246+
req2, err := http.NewRequest(http.MethodPost, getURL(serverPort, fmt.Sprintf(stopEndpoint, endpointId, "network-packet-loss")), nil)
247+
if err != nil {
248+
t.Logf("Unable to create stop packet loss request")
249+
}
250+
_, err = client.Do(req2)
251+
if err != nil {
252+
t.Logf("Error occurred while cleaning up network packet loss fault injection: %v\n", err)
253+
}
254+
}
255+
256+
// skipForUnsupportedTc will test whether or not the tc utility installed can use the required flag option(s) [e.g. -j] which is used to start/stop/check status of certain
257+
// network faults such as latency and packet loss.
258+
func skipForUnsupportedTc(t *testing.T) {
259+
cmdCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
260+
defer cancel()
261+
path, err := exec.LookPath("tc")
262+
if err != nil {
263+
t.Skipf("tc utility tool is not found on the host. Error: %v", err)
264+
}
265+
args := []string{"-j", "q", "show"}
266+
_, err = exec.CommandContext(cmdCtx, path, args[0:]...).CombinedOutput()
267+
if err != nil {
268+
t.Skipf("Current version of the tc utility does have the required flag/configuration. Error: %v", err)
269+
}
270+
}
271+
272+
func TestParallelNetworkFaults(t *testing.T) {
273+
tcs := []struct {
274+
name string
275+
faultType1 string
276+
faultType2 string
277+
body1 interface{}
278+
body2 interface{}
279+
responseCode1 int
280+
responseCode2 int
281+
}{
282+
{
283+
name: "network black hole port same type",
284+
faultType1: types.BlackHolePortFaultType,
285+
faultType2: types.BlackHolePortFaultType,
286+
body1: getNetworkBlackHolePortRequestBody(1234, "tcp", "ingress"),
287+
body2: getNetworkBlackHolePortRequestBody(4321, "tcp", "egress"),
288+
responseCode1: http.StatusOK,
289+
responseCode2: http.StatusOK,
290+
},
291+
{
292+
name: "network latency same type",
293+
faultType1: types.LatencyFaultType,
294+
faultType2: types.LatencyFaultType,
295+
body1: getNetworkLatencyRequestBody(100, 200, []string{"1.1.1.1", "2.2.2.2"}, []string{"8.8.8.8"}),
296+
body2: getNetworkLatencyRequestBody(200, 300, []string{"3.3.3.3", "4.4.4.4"}, []string{"9.9.9.9"}),
297+
responseCode1: http.StatusOK,
298+
responseCode2: http.StatusConflict,
299+
},
300+
{
301+
name: "network packet loss same type",
302+
faultType1: types.PacketLossFaultType,
303+
faultType2: types.PacketLossFaultType,
304+
body1: getNetworkPacketLossRequestBody(50, []string{"1.1.1.1", "2.2.2.2"}, []string{"8.8.8.8"}),
305+
body2: getNetworkPacketLossRequestBody(70, []string{"3.3.3.3", "4.4.4.4"}, []string{"9.9.9.9"}),
306+
responseCode1: http.StatusOK,
307+
responseCode2: http.StatusConflict,
308+
},
309+
{
310+
name: "network latency and packet loss different type",
311+
faultType1: types.LatencyFaultType,
312+
faultType2: types.PacketLossFaultType,
313+
body1: getNetworkLatencyRequestBody(100, 200, []string{"1.1.1.1", "2.2.2.2"}, []string{"8.8.8.8"}),
314+
body2: getNetworkPacketLossRequestBody(70, []string{"3.3.3.3", "4.4.4.4"}, []string{"9.9.9.9"}),
315+
responseCode1: http.StatusOK,
316+
responseCode2: http.StatusConflict,
317+
},
318+
{
319+
name: "network black hole port and latency different type",
320+
faultType1: types.BlackHolePortFaultType,
321+
faultType2: types.LatencyFaultType,
322+
body1: getNetworkBlackHolePortRequestBody(4321, "tcp", "ingress"),
323+
body2: getNetworkLatencyRequestBody(100, 200, []string{"1.1.1.1", "2.2.2.2"}, []string{"8.8.8.8"}),
324+
responseCode1: http.StatusOK,
325+
responseCode2: http.StatusOK,
326+
},
327+
{
328+
name: "network black hole port and packet loss different type",
329+
faultType1: types.BlackHolePortFaultType,
330+
faultType2: types.PacketLossFaultType,
331+
body1: getNetworkBlackHolePortRequestBody(4321, "tcp", "ingress"),
332+
body2: getNetworkPacketLossRequestBody(50, []string{"1.1.1.1", "2.2.2.2"}, []string{"8.8.8.8"}),
333+
responseCode1: http.StatusOK,
334+
responseCode2: http.StatusOK,
335+
},
336+
}
337+
skipForUnsupportedTc(t)
338+
for _, tc := range tcs {
339+
t.Run(tc.name, func(t *testing.T) {
340+
server, port := startServer(t)
341+
defer shutdownServer(t, server)
342+
// Cleaning up the fault injections afterwards
343+
defer func() {
344+
t.Logf("Cleaning up fault injections...")
345+
cleanupLatencyAndPacketLossFaults(t, port)
346+
cleanupBlackHolePortFault(t, port, tc.body1)
347+
// Sleeping for more than 5 seconds since we're only accepting 1 request per 5 seconds for each requests at a time
348+
time.Sleep(6 * time.Second)
349+
cleanupBlackHolePortFault(t, port, tc.body2)
350+
}()
351+
352+
client := &http.Client{}
353+
354+
// Making a start network fault request for the first fault type
355+
bodyBytes, _ := json.Marshal(tc.body1)
356+
bodyReader := bytes.NewReader(bodyBytes)
357+
req1, err := http.NewRequest(http.MethodPost, getURL(port, fmt.Sprintf(startEndpoint, endpointId, tc.faultType1)), bodyReader)
358+
require.NoError(t, err)
359+
res1, err := client.Do(req1)
360+
require.NoError(t, err)
361+
362+
// Sleeping for more than 5 seconds since we're only accepting 1 request per 5 seconds for each requests at a time
363+
time.Sleep(6 * time.Second)
364+
365+
// Making a start network fault request for the second fault type
366+
bodyBytes, _ = json.Marshal(tc.body2)
367+
bodyReader = bytes.NewReader(bodyBytes)
368+
req2, err := http.NewRequest(http.MethodPost, getURL(port, fmt.Sprintf(startEndpoint, endpointId, tc.faultType2)), bodyReader)
369+
require.NoError(t, err)
370+
res2, err := client.Do(req2)
371+
require.NoError(t, err)
372+
373+
assert.Equal(t, tc.responseCode1, res1.StatusCode)
374+
assert.Equal(t, tc.responseCode2, res2.StatusCode)
375+
})
376+
}
377+
}

0 commit comments

Comments
 (0)