Skip to content

Commit

Permalink
Start ovs-vswitchd with flow-restore-wait
Browse files Browse the repository at this point in the history
This patch starts ovs-vswitchd with flow-restore-wait set to true and
removes the config after restoring necessary flows for the following
reasons:

1. It prevents packets from being mishandled by ovs-vswitchd in its
default fashion, which could affect existing connections' conntrack
state and cause issues like antrea-io#625.

2. It prevents ovs-vswitchd from flushing or expiring previously set
datapath flows, so existing connections can achieve 0 downtime during
OVS restart. As a result, we remove the config here after restoring
necessary flows.
  • Loading branch information
tnqn committed Apr 27, 2020
1 parent 7ebacb8 commit 03f62f8
Show file tree
Hide file tree
Showing 8 changed files with 179 additions and 5 deletions.
22 changes: 20 additions & 2 deletions build/images/scripts/start_ovs
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,33 @@

source logging
source daemon_status
source /usr/share/openvswitch/scripts/ovs-lib

CONTAINER_NAME="antrea-ovs"
OVS_DB_FILE="/var/run/openvswitch/conf.db"

set -euo pipefail

function start_ovs {
log_info $CONTAINER_NAME "Starting OVS"
/usr/share/openvswitch/scripts/ovs-ctl --system-id=random start --db-file=$OVS_DB_FILE
if daemon_is_running ovsdb-server; then
log_info $CONTAINER_NAME "ovsdb-server is already running"
else
log_info $CONTAINER_NAME "Starting ovsdb-server"
/usr/share/openvswitch/scripts/ovs-ctl --no-ovs-vswitchd --system-id=random start --db-file=$OVS_DB_FILE
log_info $CONTAINER_NAME "Started ovsdb-server"
fi

if daemon_is_running ovs-switchd; then
log_info $CONTAINER_NAME "ovs-switchd is already running"
else
log_info $CONTAINER_NAME "Starting ovs-switchd"
# Start ovs-vswitchd with flow-restore-wait set to true so that packets won't be
# mishandled in its default fashion, the config will be removed after antrea-agent
# restoring flows.
ovs-vsctl --no-wait set open_vswitch . other_config:flow-restore-wait="true"
/usr/share/openvswitch/scripts/ovs-ctl --no-ovsdb-server --system-id=random start --db-file=$OVS_DB_FILE
log_info $CONTAINER_NAME "Started ovs-switchd"
fi
}

function stop_ovs {
Expand Down
22 changes: 20 additions & 2 deletions build/images/scripts/start_ovs_netdev
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

source logging
source daemon_status
source /usr/share/openvswitch/scripts/ovs-lib

CONTAINER_NAME="antrea-ovs"
OVS_DB_FILE="/var/run/openvswitch/conf.db"
Expand Down Expand Up @@ -47,8 +48,25 @@ function del_br_phy {
}

function start_ovs {
log_info $CONTAINER_NAME "Starting OVS"
/usr/share/openvswitch/scripts/ovs-ctl --system-id=random start --db-file=$OVS_DB_FILE
if daemon_is_running ovsdb-server; then
log_info $CONTAINER_NAME "ovsdb-server is already running"
else
log_info $CONTAINER_NAME "Starting ovsdb-server"
/usr/share/openvswitch/scripts/ovs-ctl --no-ovs-vswitchd --system-id=random start --db-file=$OVS_DB_FILE
log_info $CONTAINER_NAME "Started ovsdb-server"
fi

if daemon_is_running ovs-switchd; then
log_info $CONTAINER_NAME "ovs-switchd is already running"
else
log_info $CONTAINER_NAME "Starting ovs-switchd"
# Start ovs-vswitchd with flow-restore-wait set to true so that packets won't be
# mishandled in its default fashion, the config will be removed after antrea-agent
# restoring flows.
ovs-vsctl --no-wait set open_vswitch . other_config:flow-restore-wait="true"
/usr/share/openvswitch/scripts/ovs-ctl --no-ovsdb-server --system-id=random start --db-file=$OVS_DB_FILE
log_info $CONTAINER_NAME "Started ovs-switchd"
fi
}

function stop_ovs {
Expand Down
11 changes: 11 additions & 0 deletions cmd/antrea-agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,17 @@ func run(o *Options) error {
return fmt.Errorf("error initializing CNI server: %v", err)
}

// ovs-vswitchd is started with flow-restore-wait set to true for the following reasons:
// 1. It prevents packets from being mishandled by ovs-vswitchd in its default fashion,
// which could affect existing connections' conntrack state and cause issues like #625.
// 2. It prevents ovs-vswitchd from flushing or expiring previously set datapath flows,
// so existing connections can achieve 0 downtime during OVS restart.
// As a result, we remove the config here after restoring necessary flows.
klog.Info("Cleaning up flow-restore-wait config")
if err := ovsBridgeClient.DeleteOVSOtherConfig(map[string]interface{}{"flow-restore-wait": "true"}); err != nil {
return fmt.Errorf("error when cleaning up flow-restore-wait config")
}

// set up signal capture: the first SIGTERM / SIGINT signal is handled gracefully and will
// cause the stopCh channel to be closed; if another signal is received before the program
// exits, we will force exit.
Expand Down
5 changes: 5 additions & 0 deletions pkg/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,11 @@ func (i *Initializer) initOpenFlowPipeline() error {
klog.Info("Replaying OF flows to OVS bridge")
i.ofClient.ReplayFlows()
klog.Info("Flow replay completed")

klog.Info("Cleaning up flow-restore-wait config")
if err := i.ovsBridgeClient.DeleteOVSOtherConfig(map[string]interface{}{"flow-restore-wait": "true"}); err != nil {
klog.Errorf("Error when cleaning up flow-restore-wait config")
}
}
}()

Expand Down
3 changes: 3 additions & 0 deletions pkg/ovs/ovsconfig/interfaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,7 @@ type OVSBridgeClient interface {
GetPortList() ([]OVSPortData, Error)
SetInterfaceMTU(name string, MTU int) error
GetOVSVersion() (string, Error)
AddOVSOtherConfig(configs map[string]interface{}) Error
GetOVSOtherConfig() (map[string]string, Error)
DeleteOVSOtherConfig(configs map[string]interface{}) Error
}
55 changes: 55 additions & 0 deletions pkg/ovs/ovsconfig/ovs_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -673,3 +673,58 @@ func (br *OVSBridge) GetOVSVersion() (string, Error) {

return res[0].Rows[0].(map[string]interface{})["ovs_version"].(string), nil
}

func (br *OVSBridge) AddOVSOtherConfig(configs map[string]interface{}) Error {
tx := br.ovsdb.Transaction(openvSwitchSchema)

mutateSet := helpers.MakeOVSDBMap(configs)
tx.Mutate(dbtransaction.Mutate{
Table: "Open_vSwitch",
Mutations: [][]interface{}{{"other_config", "insert", mutateSet}},
})

_, err, temporary := tx.Commit()
if err != nil {
klog.Error("Transaction failed: ", err)
return NewTransactionError(err, temporary)
}
return nil
}

func (br *OVSBridge) GetOVSOtherConfig() (map[string]string, Error) {
tx := br.ovsdb.Transaction(openvSwitchSchema)

tx.Select(dbtransaction.Select{
Table: "Open_vSwitch",
Columns: []string{"other_config"},
})

res, err, temporary := tx.Commit()
if err != nil {
klog.Error("Transaction failed: ", err)
return nil, NewTransactionError(err, temporary)
}
if len(res[0].Rows) == 0 {
klog.Warning("Could not find other_config")
return nil, nil
}
otherConfigs := res[0].Rows[0].(map[string]interface{})["other_config"].([]interface{})
return buildMapFromOVSDBMap(otherConfigs), nil
}

func (br *OVSBridge) DeleteOVSOtherConfig(configs map[string]interface{}) Error {
tx := br.ovsdb.Transaction(openvSwitchSchema)

mutateSet := helpers.MakeOVSDBMap(configs)
tx.Mutate(dbtransaction.Mutate{
Table: "Open_vSwitch",
Mutations: [][]interface{}{{"other_config", "delete", mutateSet}},
})

_, err, temporary := tx.Commit()
if err != nil {
klog.Error("Transaction failed: ", err)
return NewTransactionError(err, temporary)
}
return nil
}
45 changes: 44 additions & 1 deletion pkg/ovs/ovsconfig/testing/mock_ovsconfig.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions test/integration/ovs/ovs_client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,27 @@ func TestOVSBridgeExternalIDs(t *testing.T) {
}
}

func TestOVSOtherConfig(t *testing.T) {
data := &testData{}
data.setup(t)
defer data.teardown(t)

otherConfigs := map[string]interface{}{"flow-restore-wait": "true"}
err := data.br.AddOVSOtherConfig(otherConfigs)
require.Nil(t, err, "Error when adding OVS other_config")

gotOtherConfigs, err := data.br.GetOVSOtherConfig()
require.Nil(t, err, "Error when getting OVS other_config")
require.Equal(t, map[string]string{"flow-restore-wait": "true"}, gotOtherConfigs, "other_config mismatched")

err = data.br.DeleteOVSOtherConfig(otherConfigs)
require.Nil(t, err, "Error when deleting OVS other_config")

gotOtherConfigs, err = data.br.GetOVSOtherConfig()
require.Nil(t, err, "Error when getting OVS other_config")
require.Equal(t, map[string]string{}, gotOtherConfigs, "other_config mismatched")
}

func deleteAllPorts(t *testing.T, br *ovsconfig.OVSBridge) {
portList, err := br.GetPortUUIDList()
require.Nil(t, err, "Error when retrieving port list")
Expand Down

0 comments on commit 03f62f8

Please sign in to comment.