From 03f62f8165cff5c71379eb628efecce8c53ccf3c Mon Sep 17 00:00:00 2001 From: Quan Tian Date: Mon, 27 Apr 2020 21:01:49 +0800 Subject: [PATCH] Start ovs-vswitchd with flow-restore-wait This patch starts ovs-vswitchd with flow-restore-wait set to true and removes the config after restoring necessary flows for the following reasons: 1. It prevents packets from being mishandled by ovs-vswitchd in its default fashion, which could affect existing connections' conntrack state and cause issues like #625. 2. It prevents ovs-vswitchd from flushing or expiring previously set datapath flows, so existing connections can achieve 0 downtime during OVS restart. As a result, we remove the config here after restoring necessary flows. --- build/images/scripts/start_ovs | 22 ++++++++- build/images/scripts/start_ovs_netdev | 22 ++++++++- cmd/antrea-agent/agent.go | 11 +++++ pkg/agent/agent.go | 5 ++ pkg/ovs/ovsconfig/interfaces.go | 3 ++ pkg/ovs/ovsconfig/ovs_client.go | 55 +++++++++++++++++++++ pkg/ovs/ovsconfig/testing/mock_ovsconfig.go | 45 ++++++++++++++++- test/integration/ovs/ovs_client_test.go | 21 ++++++++ 8 files changed, 179 insertions(+), 5 deletions(-) diff --git a/build/images/scripts/start_ovs b/build/images/scripts/start_ovs index 0727aacb756..3ef5f36aaa5 100755 --- a/build/images/scripts/start_ovs +++ b/build/images/scripts/start_ovs @@ -2,6 +2,7 @@ source logging source daemon_status +source /usr/share/openvswitch/scripts/ovs-lib CONTAINER_NAME="antrea-ovs" OVS_DB_FILE="/var/run/openvswitch/conf.db" @@ -9,8 +10,25 @@ OVS_DB_FILE="/var/run/openvswitch/conf.db" set -euo pipefail function start_ovs { - log_info $CONTAINER_NAME "Starting OVS" - /usr/share/openvswitch/scripts/ovs-ctl --system-id=random start --db-file=$OVS_DB_FILE + if daemon_is_running ovsdb-server; then + log_info $CONTAINER_NAME "ovsdb-server is already running" + else + log_info $CONTAINER_NAME "Starting ovsdb-server" + /usr/share/openvswitch/scripts/ovs-ctl --no-ovs-vswitchd --system-id=random start --db-file=$OVS_DB_FILE + log_info $CONTAINER_NAME "Started ovsdb-server" + fi + + if daemon_is_running ovs-switchd; then + log_info $CONTAINER_NAME "ovs-switchd is already running" + else + log_info $CONTAINER_NAME "Starting ovs-switchd" + # Start ovs-vswitchd with flow-restore-wait set to true so that packets won't be + # mishandled in its default fashion, the config will be removed after antrea-agent + # restoring flows. + ovs-vsctl --no-wait set open_vswitch . other_config:flow-restore-wait="true" + /usr/share/openvswitch/scripts/ovs-ctl --no-ovsdb-server --system-id=random start --db-file=$OVS_DB_FILE + log_info $CONTAINER_NAME "Started ovs-switchd" + fi } function stop_ovs { diff --git a/build/images/scripts/start_ovs_netdev b/build/images/scripts/start_ovs_netdev index 787f9d30016..78fd5870631 100755 --- a/build/images/scripts/start_ovs_netdev +++ b/build/images/scripts/start_ovs_netdev @@ -2,6 +2,7 @@ source logging source daemon_status +source /usr/share/openvswitch/scripts/ovs-lib CONTAINER_NAME="antrea-ovs" OVS_DB_FILE="/var/run/openvswitch/conf.db" @@ -47,8 +48,25 @@ function del_br_phy { } function start_ovs { - log_info $CONTAINER_NAME "Starting OVS" - /usr/share/openvswitch/scripts/ovs-ctl --system-id=random start --db-file=$OVS_DB_FILE + if daemon_is_running ovsdb-server; then + log_info $CONTAINER_NAME "ovsdb-server is already running" + else + log_info $CONTAINER_NAME "Starting ovsdb-server" + /usr/share/openvswitch/scripts/ovs-ctl --no-ovs-vswitchd --system-id=random start --db-file=$OVS_DB_FILE + log_info $CONTAINER_NAME "Started ovsdb-server" + fi + + if daemon_is_running ovs-switchd; then + log_info $CONTAINER_NAME "ovs-switchd is already running" + else + log_info $CONTAINER_NAME "Starting ovs-switchd" + # Start ovs-vswitchd with flow-restore-wait set to true so that packets won't be + # mishandled in its default fashion, the config will be removed after antrea-agent + # restoring flows. + ovs-vsctl --no-wait set open_vswitch . other_config:flow-restore-wait="true" + /usr/share/openvswitch/scripts/ovs-ctl --no-ovsdb-server --system-id=random start --db-file=$OVS_DB_FILE + log_info $CONTAINER_NAME "Started ovs-switchd" + fi } function stop_ovs { diff --git a/cmd/antrea-agent/agent.go b/cmd/antrea-agent/agent.go index 685bd388afe..d08ace04e03 100644 --- a/cmd/antrea-agent/agent.go +++ b/cmd/antrea-agent/agent.go @@ -139,6 +139,17 @@ func run(o *Options) error { return fmt.Errorf("error initializing CNI server: %v", err) } + // ovs-vswitchd is started with flow-restore-wait set to true for the following reasons: + // 1. It prevents packets from being mishandled by ovs-vswitchd in its default fashion, + // which could affect existing connections' conntrack state and cause issues like #625. + // 2. It prevents ovs-vswitchd from flushing or expiring previously set datapath flows, + // so existing connections can achieve 0 downtime during OVS restart. + // As a result, we remove the config here after restoring necessary flows. + klog.Info("Cleaning up flow-restore-wait config") + if err := ovsBridgeClient.DeleteOVSOtherConfig(map[string]interface{}{"flow-restore-wait": "true"}); err != nil { + return fmt.Errorf("error when cleaning up flow-restore-wait config") + } + // set up signal capture: the first SIGTERM / SIGINT signal is handled gracefully and will // cause the stopCh channel to be closed; if another signal is received before the program // exits, we will force exit. diff --git a/pkg/agent/agent.go b/pkg/agent/agent.go index 54454608358..b2c8fcc90bd 100644 --- a/pkg/agent/agent.go +++ b/pkg/agent/agent.go @@ -291,6 +291,11 @@ func (i *Initializer) initOpenFlowPipeline() error { klog.Info("Replaying OF flows to OVS bridge") i.ofClient.ReplayFlows() klog.Info("Flow replay completed") + + klog.Info("Cleaning up flow-restore-wait config") + if err := i.ovsBridgeClient.DeleteOVSOtherConfig(map[string]interface{}{"flow-restore-wait": "true"}); err != nil { + klog.Errorf("Error when cleaning up flow-restore-wait config") + } } }() diff --git a/pkg/ovs/ovsconfig/interfaces.go b/pkg/ovs/ovsconfig/interfaces.go index 5e0aa8f2a40..83909ff787d 100644 --- a/pkg/ovs/ovsconfig/interfaces.go +++ b/pkg/ovs/ovsconfig/interfaces.go @@ -42,4 +42,7 @@ type OVSBridgeClient interface { GetPortList() ([]OVSPortData, Error) SetInterfaceMTU(name string, MTU int) error GetOVSVersion() (string, Error) + AddOVSOtherConfig(configs map[string]interface{}) Error + GetOVSOtherConfig() (map[string]string, Error) + DeleteOVSOtherConfig(configs map[string]interface{}) Error } diff --git a/pkg/ovs/ovsconfig/ovs_client.go b/pkg/ovs/ovsconfig/ovs_client.go index 61b6767b6ff..edd75c03730 100644 --- a/pkg/ovs/ovsconfig/ovs_client.go +++ b/pkg/ovs/ovsconfig/ovs_client.go @@ -673,3 +673,58 @@ func (br *OVSBridge) GetOVSVersion() (string, Error) { return res[0].Rows[0].(map[string]interface{})["ovs_version"].(string), nil } + +func (br *OVSBridge) AddOVSOtherConfig(configs map[string]interface{}) Error { + tx := br.ovsdb.Transaction(openvSwitchSchema) + + mutateSet := helpers.MakeOVSDBMap(configs) + tx.Mutate(dbtransaction.Mutate{ + Table: "Open_vSwitch", + Mutations: [][]interface{}{{"other_config", "insert", mutateSet}}, + }) + + _, err, temporary := tx.Commit() + if err != nil { + klog.Error("Transaction failed: ", err) + return NewTransactionError(err, temporary) + } + return nil +} + +func (br *OVSBridge) GetOVSOtherConfig() (map[string]string, Error) { + tx := br.ovsdb.Transaction(openvSwitchSchema) + + tx.Select(dbtransaction.Select{ + Table: "Open_vSwitch", + Columns: []string{"other_config"}, + }) + + res, err, temporary := tx.Commit() + if err != nil { + klog.Error("Transaction failed: ", err) + return nil, NewTransactionError(err, temporary) + } + if len(res[0].Rows) == 0 { + klog.Warning("Could not find other_config") + return nil, nil + } + otherConfigs := res[0].Rows[0].(map[string]interface{})["other_config"].([]interface{}) + return buildMapFromOVSDBMap(otherConfigs), nil +} + +func (br *OVSBridge) DeleteOVSOtherConfig(configs map[string]interface{}) Error { + tx := br.ovsdb.Transaction(openvSwitchSchema) + + mutateSet := helpers.MakeOVSDBMap(configs) + tx.Mutate(dbtransaction.Mutate{ + Table: "Open_vSwitch", + Mutations: [][]interface{}{{"other_config", "delete", mutateSet}}, + }) + + _, err, temporary := tx.Commit() + if err != nil { + klog.Error("Transaction failed: ", err) + return NewTransactionError(err, temporary) + } + return nil +} diff --git a/pkg/ovs/ovsconfig/testing/mock_ovsconfig.go b/pkg/ovs/ovsconfig/testing/mock_ovsconfig.go index cd52701460e..785f8eeb754 100644 --- a/pkg/ovs/ovsconfig/testing/mock_ovsconfig.go +++ b/pkg/ovs/ovsconfig/testing/mock_ovsconfig.go @@ -1,4 +1,4 @@ -// Copyright 2019 Antrea Authors +// Copyright 2020 Antrea Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -48,6 +48,20 @@ func (m *MockOVSBridgeClient) EXPECT() *MockOVSBridgeClientMockRecorder { return m.recorder } +// AddOVSOtherConfig mocks base method +func (m *MockOVSBridgeClient) AddOVSOtherConfig(arg0 map[string]interface{}) ovsconfig.Error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "AddOVSOtherConfig", arg0) + ret0, _ := ret[0].(ovsconfig.Error) + return ret0 +} + +// AddOVSOtherConfig indicates an expected call of AddOVSOtherConfig +func (mr *MockOVSBridgeClientMockRecorder) AddOVSOtherConfig(arg0 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddOVSOtherConfig", reflect.TypeOf((*MockOVSBridgeClient)(nil).AddOVSOtherConfig), arg0) +} + // Create mocks base method func (m *MockOVSBridgeClient) Create() ovsconfig.Error { m.ctrl.T.Helper() @@ -136,6 +150,20 @@ func (mr *MockOVSBridgeClientMockRecorder) Delete() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Delete", reflect.TypeOf((*MockOVSBridgeClient)(nil).Delete)) } +// DeleteOVSOtherConfig mocks base method +func (m *MockOVSBridgeClient) DeleteOVSOtherConfig(arg0 map[string]interface{}) ovsconfig.Error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "DeleteOVSOtherConfig", arg0) + ret0, _ := ret[0].(ovsconfig.Error) + return ret0 +} + +// DeleteOVSOtherConfig indicates an expected call of DeleteOVSOtherConfig +func (mr *MockOVSBridgeClientMockRecorder) DeleteOVSOtherConfig(arg0 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DeleteOVSOtherConfig", reflect.TypeOf((*MockOVSBridgeClient)(nil).DeleteOVSOtherConfig), arg0) +} + // DeletePort mocks base method func (m *MockOVSBridgeClient) DeletePort(arg0 string) ovsconfig.Error { m.ctrl.T.Helper() @@ -194,6 +222,21 @@ func (mr *MockOVSBridgeClientMockRecorder) GetOFPort(arg0 interface{}) *gomock.C return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetOFPort", reflect.TypeOf((*MockOVSBridgeClient)(nil).GetOFPort), arg0) } +// GetOVSOtherConfig mocks base method +func (m *MockOVSBridgeClient) GetOVSOtherConfig() (map[string]string, ovsconfig.Error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetOVSOtherConfig") + ret0, _ := ret[0].(map[string]string) + ret1, _ := ret[1].(ovsconfig.Error) + return ret0, ret1 +} + +// GetOVSOtherConfig indicates an expected call of GetOVSOtherConfig +func (mr *MockOVSBridgeClientMockRecorder) GetOVSOtherConfig() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetOVSOtherConfig", reflect.TypeOf((*MockOVSBridgeClient)(nil).GetOVSOtherConfig)) +} + // GetOVSVersion mocks base method func (m *MockOVSBridgeClient) GetOVSVersion() (string, ovsconfig.Error) { m.ctrl.T.Helper() diff --git a/test/integration/ovs/ovs_client_test.go b/test/integration/ovs/ovs_client_test.go index 5d3cac0f90e..2a48aec6b19 100644 --- a/test/integration/ovs/ovs_client_test.go +++ b/test/integration/ovs/ovs_client_test.go @@ -150,6 +150,27 @@ func TestOVSBridgeExternalIDs(t *testing.T) { } } +func TestOVSOtherConfig(t *testing.T) { + data := &testData{} + data.setup(t) + defer data.teardown(t) + + otherConfigs := map[string]interface{}{"flow-restore-wait": "true"} + err := data.br.AddOVSOtherConfig(otherConfigs) + require.Nil(t, err, "Error when adding OVS other_config") + + gotOtherConfigs, err := data.br.GetOVSOtherConfig() + require.Nil(t, err, "Error when getting OVS other_config") + require.Equal(t, map[string]string{"flow-restore-wait": "true"}, gotOtherConfigs, "other_config mismatched") + + err = data.br.DeleteOVSOtherConfig(otherConfigs) + require.Nil(t, err, "Error when deleting OVS other_config") + + gotOtherConfigs, err = data.br.GetOVSOtherConfig() + require.Nil(t, err, "Error when getting OVS other_config") + require.Equal(t, map[string]string{}, gotOtherConfigs, "other_config mismatched") +} + func deleteAllPorts(t *testing.T, br *ovsconfig.OVSBridge) { portList, err := br.GetPortUUIDList() require.Nil(t, err, "Error when retrieving port list")