23726258 SMF enable-disable with many nws puts neutron-dhcp-agent into maintenance
23855850 Neutron agents (L3 and DHCP) should cleanup pre-existing resources when starting
23855912 neutron-l3-agent should make sure contract is empty in SMF stop method
--- a/components/openstack/common/files/openstack_common.py Tue Jul 12 11:11:21 2016 -0700
+++ b/components/openstack/common/files/openstack_common.py Tue Jul 12 11:18:58 2016 -0700
@@ -29,7 +29,7 @@
import json
import os
import shutil
-from subprocess import Popen, PIPE
+from subprocess import CalledProcessError, Popen, PIPE, check_call
import time
import uuid
@@ -290,3 +290,27 @@
except NoOptionError:
return False
return "ml2" in core_plugin.lower()
+
+
+def kill_contract(attempts, interval, ctid):
+    """Send SIGTERM to contract `ctid` every `interval` seconds until it is
+    empty or `attempts` tries have been made.  Returns 0 if pkill failed,
+    1 if the contract is (or already was) empty, and 2 if processes were
+    still left in it after the last attempt.
+    """
+    with open(os.devnull, "w") as devnull:
+        for _ in xrange(attempts):
+            try:
+                check_call(["/usr/bin/pkill", "-c", ctid])
+            except CalledProcessError as err:
+                # exit status 1 only means no process matched: already empty
+                if err.returncode == 1:
+                    return 1
+                print "failed to kill the SMF contract: %s" % err
+                return 0
+            time.sleep(interval)
+            # pgrep exits 1 when nothing matches, i.e. the contract is empty
+            if Popen(["/usr/bin/pgrep", "-c", ctid], stdout=devnull,
+                     stderr=devnull).wait() == 1:
+                return 1
+    return 2
--- a/components/openstack/neutron/files/neutron-dhcp-agent Tue Jul 12 11:11:21 2016 -0700
+++ b/components/openstack/neutron/files/neutron-dhcp-agent Tue Jul 12 11:18:58 2016 -0700
@@ -19,7 +19,7 @@
import re
import sys
-from openstack_common import is_ml2_plugin
+from openstack_common import is_ml2_plugin, kill_contract
import smf_include
from subprocess import CalledProcessError, Popen, PIPE, check_call
@@ -45,6 +45,48 @@
return True
+def cleanup_dhcp_agent_datalinks():
+    """Remove the datalinks used by the DHCP agent: their IP interface, VNIC
+    and, when ML2 is in use, OVS port.  Returns an SMF exit code.
+    """
+    cmd = ["/usr/sbin/dladm", "show-link", "-p", "-o", "link"]
+    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
+    output, _ = p.communicate()
+    if p.returncode != 0:
+        print "failed to retrieve datalink names"
+        return smf_include.SMF_EXIT_ERR_FATAL
+
+    # DHCP agent datalinks are always 15 characters in length. They start with
+    # 'dh', end with '_0', and in between they are hexadecimal digits.
+    prog = re.compile(r'dh[0-9A-Fa-f\_]{11}_0')
+    ret_code = smf_include.SMF_EXIT_OK
+    ovs_bridge = get_ovs_bridge() if is_ml2_plugin() else None
+    for dlname in output.splitlines():
+        if prog.search(dlname) is None:
+            continue
+        try:
+            # first remove the IP
+            check_call(["/usr/bin/pfexec", "/usr/sbin/ipadm", "delete-ip",
+                        dlname])
+        except CalledProcessError:
+            # It is possible that the IP was already deleted but not the
+            # datalink. So we continue and try and delete the datalink.
+            pass
+        try:
+            # next remove the VNIC
+            check_call(["/usr/bin/pfexec", "/usr/sbin/dladm", "delete-vnic",
+                        dlname])
+            # remove the OVS Port
+            if ovs_bridge:
+                check_call(["/usr/bin/pfexec", "/usr/sbin/ovs-vsctl", "--",
+                            "--if-exists", "del-port", ovs_bridge, dlname])
+        except CalledProcessError as err:
+            print "failed to remove datalink '%s' used by DHCP agent: %s" % \
+                (dlname, err)
+            ret_code = smf_include.SMF_EXIT_ERR_FATAL
+    return ret_code
+
+
def start():
# verify paths are valid
for f in sys.argv[2:4]:
@@ -52,6 +94,11 @@
print '%s does not exist or is not readable' % f
return smf_include.SMF_EXIT_ERR_CONFIG
+ # remove VNICs associated with DHCP agent if any were left over.
+ ret_code = cleanup_dhcp_agent_datalinks()
+ if ret_code != smf_include.SMF_EXIT_OK:
+ return ret_code
+
# set the hostmodel property if necessary
if not set_hostmodel("src-priority"):
return smf_include.SMF_EXIT_ERR_FATAL
@@ -72,47 +119,22 @@
def stop():
- try:
- # first kill the SMF contract
- check_call(["/usr/bin/pkill", "-c", sys.argv[2]])
- except CalledProcessError as err:
- print "failed to kill the SMF contract: %s" % err
- return smf_include.SMF_EXIT_ERR_FATAL
-
- cmd = ["/usr/sbin/ipadm", "show-if", "-p", "-o", "ifname"]
- p = Popen(cmd, stdout=PIPE, stderr=PIPE)
- output, error = p.communicate()
- if p.returncode != 0:
- print "failed to retrieve IP interface names"
+ # Keep issuing SIGTERM until the contract is empty. This way we will catch
+ # any child processes missed because they were getting forked.
+ # 50 attempts will be made at intervals of 2 seconds. Typically, we
+ # will only need 0 or 1 additional attempt before the contract is empty but
+ # we chose to err on the side of caution. In the worst case, we will use
+ # 100 seconds in the below loop which will leave 500 seconds (timeout is
+ # 600s) for the other cleanup tasks, after which the service will be put to
+ # maintenance state if the contract was not killed successfully.
+ if not kill_contract(50, 2, sys.argv[2]):
return smf_include.SMF_EXIT_ERR_FATAL
- ifnames = output.splitlines()
- # DHCP agent datalinks are always 15 characters in length. They start with
- # 'dh', end with '_0', and in between they are hexadecimal digits.
- prog = re.compile('dh[0-9A-Fa-f\_]{11}_0')
- err_delete = False
- for ifname in ifnames:
- if prog.search(ifname) is None:
- continue
- try:
- # first remove the IP
- check_call(["/usr/bin/pfexec", "/usr/sbin/ipadm", "delete-ip",
- ifname])
- # next remove the VNIC
- check_call(["/usr/bin/pfexec", "/usr/sbin/dladm", "delete-vnic",
- ifname])
- # remove the OVS Port
- if is_ml2_plugin():
- check_call(["/usr/bin/pfexec", "/usr/sbin/ovs-vsctl", "--",
- "--if-exists", "del-port", get_ovs_bridge(),
- ifname])
- except CalledProcessError as err:
- print "failed to remove datalink '%s' used by DHCP agent: %s" % \
- (ifname, err)
- err_delete = True
+ # remove VNICs associated with DHCP agent
+ ret_code = cleanup_dhcp_agent_datalinks()
# finally reset the hostmodel property
- if not set_hostmodel("weak") or err_delete:
+ if not set_hostmodel("weak") or ret_code != smf_include.SMF_EXIT_OK:
return smf_include.SMF_EXIT_ERR_FATAL
return smf_include.SMF_EXIT_OK
--- a/components/openstack/neutron/files/neutron-dhcp-agent.xml Tue Jul 12 11:11:21 2016 -0700
+++ b/components/openstack/neutron/files/neutron-dhcp-agent.xml Tue Jul 12 11:18:58 2016 -0700
@@ -85,7 +85,7 @@
<logfile_attributes permissions='600'/>
- <exec_method timeout_seconds="60" type="method" name="start"
+ <exec_method timeout_seconds="600" type="method" name="start"
exec="/lib/svc/method/neutron-dhcp-agent %m %{config/config_path} %{config/dhcp_config_path}">
<method_context>
<method_credential user='neutron' group='neutron'
--- a/components/openstack/neutron/files/neutron-l3-agent Tue Jul 12 11:11:21 2016 -0700
+++ b/components/openstack/neutron/files/neutron-l3-agent Tue Jul 12 11:18:58 2016 -0700
@@ -21,7 +21,7 @@
import sys
import netaddr
-from openstack_common import is_ml2_plugin
+from openstack_common import is_ml2_plugin, kill_contract
import smf_include
from neutron.agent.solaris import packetfilter
@@ -51,6 +51,49 @@
return True
+def cleanup_l3_agent_datalinks():
+    """Remove the datalinks used by the L3 agent: their IP interface, VNIC
+    and, when ML2 is in use, OVS port.  Returns an SMF exit code.
+    """
+    cmd = ["/usr/sbin/dladm", "show-link", "-p", "-o", "link"]
+    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
+    output, _ = p.communicate()
+    if p.returncode != 0:
+        print "failed to retrieve datalink names"
+        return smf_include.SMF_EXIT_ERR_FATAL
+
+    # L3 agent datalinks are always 15 characters in length: 'l3i' or 'l3e',
+    # then hexadecimal digits, then '_0'.
+    prog = re.compile(r'l3[ie][0-9A-Fa-f\_]{10}_0')
+    retcode = smf_include.SMF_EXIT_OK
+    is_ml2 = is_ml2_plugin()
+    for dlname in output.splitlines():
+        if prog.search(dlname) is None:
+            continue
+        try:
+            # first remove the IP
+            check_call(["/usr/bin/pfexec", "/usr/sbin/ipadm", "delete-ip",
+                        dlname])
+        except CalledProcessError:
+            # It is possible that the IP was already deleted but not the
+            # datalink. So we continue and try and delete the datalink.
+            pass
+        try:
+            # next remove the VNIC
+            check_call(["/usr/bin/pfexec", "/usr/sbin/dladm", "delete-vnic",
+                        dlname])
+            # remove the OVS Port
+            ovs_bridge = get_ovs_bridge(dlname) if is_ml2 else None
+            if ovs_bridge:
+                check_call(["/usr/bin/pfexec", "/usr/sbin/ovs-vsctl", "--",
+                            "--if-exists", "del-port", ovs_bridge, dlname])
+        except CalledProcessError as err:
+            print "failed to remove datalink '%s' used by L3 agent: %s" % \
+                (dlname, err)
+            retcode = smf_include.SMF_EXIT_ERR_FATAL
+    return retcode
+
+
def start():
# verify paths are valid
for f in sys.argv[2:6]:
@@ -58,6 +101,16 @@
print '%s does not exist or is not readable' % f
return smf_include.SMF_EXIT_ERR_CONFIG
+ # We need to remove VNICs associated with L3 agent if any were left over.
+ # Before that, we need to first remove the PF rules added under
+ # _auto/neutron:l3:agent anchor and then remove the IP interfaces on which
+ # the rules were applied.
+ pf = packetfilter.PacketFilter('_auto/neutron:l3:agent')
+ pf.remove_anchor_recursively()
+ ret_code = cleanup_l3_agent_datalinks()
+ if ret_code != smf_include.SMF_EXIT_OK:
+ return ret_code
+
# System-wide forwarding (either ipv4 or ipv6 or both) must be enabled
# before neutron-l3-agent can be started.
cmd = ["/usr/sbin/ipadm", "show-prop", "-c", "-p", "forwarding",
@@ -117,9 +170,9 @@
return smf_include.smf_subprocess(cmd)
-def get_ovs_bridge(ifname):
+def get_ovs_bridge(dlname):
# retrieve the right OVS bridge based on the interface name
- if ifname.startswith('l3i'):
+ if dlname.startswith('l3i'):
config_file = '/etc/neutron/plugins/openvswitch/ovs_neutron_plugin.ini'
section = "ovs"
option = "integration_bridge"
@@ -138,11 +191,16 @@
def stop():
shutdown_vpn()
- try:
- # first kill the SMF contract
- check_call(["/usr/bin/pkill", "-c", sys.argv[2]])
- except CalledProcessError as err:
- print "failed to kill the SMF contract: %s" % (err)
+ # Keep issuing SIGTERM until the contract is empty. This way we will catch
+ # any child processes missed because they were getting forked.
+ # 50 attempts will be made at intervals of 2 seconds. Typically, we
+ # will only need 0 or 1 additional attempt before the contract is empty but
+ # we chose to err on the side of caution. In the worst case, we will use
+ # 100 seconds in the below loop which will leave 500 seconds (timeout is
+ # 600s) for the other cleanup tasks, after which the service will be put to
+ # maintenance state if the contract was not killed successfully.
+ if not kill_contract(50, 2, sys.argv[2]):
+ return smf_include.SMF_EXIT_ERR_FATAL
# We need to first remove the PF rules added under _auto/neutron:l3:agent
# anchor and then remove the IP interfaces on which the rules were applied.
@@ -150,42 +208,10 @@
pf.remove_anchor_recursively()
# remove VNICs associated with L3 agent
- cmd = ["/usr/sbin/ipadm", "show-if", "-p", "-o", "ifname"]
- p = Popen(cmd, stdout=PIPE, stderr=PIPE)
- output, error = p.communicate()
- if p.returncode != 0:
- print "failed to retrieve IP interface names"
- return smf_include.SMF_EXIT_ERR_CONFIG
-
- ifnames = output.splitlines()
- # L3 agent datalinks are always 15 characters in length. They start
- # with either 'l3i' or 'l3e', end with '_0', and in between they are
- # hexadecimal digits.
- prog = re.compile('l3[ie][0-9A-Fa-f\_]{10}_0')
- err_delete = False
- for ifname in ifnames:
- if prog.search(ifname) is None:
- continue
- try:
- # first remove the IP
- check_call(["/usr/bin/pfexec", "/usr/sbin/ipadm", "delete-ip",
- ifname])
- # next remove the VNIC
- check_call(["/usr/bin/pfexec", "/usr/sbin/dladm", "delete-vnic",
- ifname])
- # remove the OVS Port
- if is_ml2_plugin():
- ovs_bridge = get_ovs_bridge(ifname)
- if ovs_bridge:
- check_call(["/usr/bin/pfexec", "/usr/sbin/ovs-vsctl", "--",
- "--if-exists", "del-port", ovs_bridge, ifname])
- except CalledProcessError as err:
- print "failed to remove datalink '%s' used by L3 agent: %s" % \
- (ifname, err)
- err_delete = True
+ ret_code = cleanup_l3_agent_datalinks()
# finally reset the hostmodel property
- if not set_hostmodel("weak") or err_delete:
+ if not set_hostmodel("weak") or ret_code != smf_include.SMF_EXIT_OK:
return smf_include.SMF_EXIT_ERR_FATAL
return smf_include.SMF_EXIT_OK
--- a/components/openstack/neutron/files/neutron-l3-agent.xml Tue Jul 12 11:11:21 2016 -0700
+++ b/components/openstack/neutron/files/neutron-l3-agent.xml Tue Jul 12 11:18:58 2016 -0700
@@ -63,7 +63,7 @@
<logfile_attributes permissions='600'/>
- <exec_method timeout_seconds="60" type="method" name="start"
+ <exec_method timeout_seconds="600" type="method" name="start"
exec="/lib/svc/method/neutron-l3-agent %m %{config/config_path} %{config/l3_config_path} %{config/vpn_config_path} %{config/ml2_config_path}">
<method_context>
<method_credential user='neutron' group='neutron'/>