23726258 SMF enable-disable with many nws puts neutron-dhcp-agent into maintenance
author chaithan.prakash@oracle.com <chaithan.prakash@oracle.com>
Tue, 12 Jul 2016 11:18:58 -0700
changeset 6382 ed601ca40b9c
parent 6381 02b02527288b
child 6384 f291f75e3c47
23726258 SMF enable-disable with many nws puts neutron-dhcp-agent into maintenance
23855850 Neutron agents (L3 and DHCP) should cleanup pre-existing resources when starting
23855912 neutron-l3-agent should make sure contract is empty in SMF stop method
components/openstack/common/files/openstack_common.py
components/openstack/neutron/files/neutron-dhcp-agent
components/openstack/neutron/files/neutron-dhcp-agent.xml
components/openstack/neutron/files/neutron-l3-agent
components/openstack/neutron/files/neutron-l3-agent.xml
--- a/components/openstack/common/files/openstack_common.py	Tue Jul 12 11:11:21 2016 -0700
+++ b/components/openstack/common/files/openstack_common.py	Tue Jul 12 11:18:58 2016 -0700
@@ -29,7 +29,7 @@
 import json
 import os
 import shutil
-from subprocess import Popen, PIPE
+from subprocess import CalledProcessError, Popen, PIPE, check_call
 import time
 import uuid
 
@@ -290,3 +290,27 @@
     except NoOptionError:
         return False
     return "ml2" in core_plugin.lower()
+
+
+def kill_contract(attempts, interval, ctid):
+    """Send SIGTERM to all processes in SMF contract 'ctid', up to 'attempts'
+    times and 'interval' seconds apart, until the contract is empty.
+    Returns 0 if pkill failed, 1 if the contract is (or was already) empty
+    and 2 if attempts were exhausted before the contract could be emptied.
+    """
+    for _ in xrange(attempts):
+        try:
+            check_call(["/usr/bin/pkill", "-c", ctid])
+        except CalledProcessError as err:
+            # pkill exits with 1 when no process matched: the contract is empty
+            if err.returncode == 1:
+                return 1
+            print "failed to kill the SMF contract: %s" % err
+            return 0
+        time.sleep(interval)
+        # pgrep exits with 1 once the contract is empty; drain its pipes
+        pgrep = Popen(["/usr/bin/pgrep", "-c", ctid], stdout=PIPE, stderr=PIPE)
+        pgrep.communicate()
+        if pgrep.returncode == 1:
+            return 1
+    return 2
--- a/components/openstack/neutron/files/neutron-dhcp-agent	Tue Jul 12 11:11:21 2016 -0700
+++ b/components/openstack/neutron/files/neutron-dhcp-agent	Tue Jul 12 11:18:58 2016 -0700
@@ -19,7 +19,7 @@
 import re
 import sys
 
-from openstack_common import is_ml2_plugin
+from openstack_common import is_ml2_plugin, kill_contract
 import smf_include
 
 from subprocess import CalledProcessError, Popen, PIPE, check_call
@@ -45,6 +45,48 @@
     return True
 
 
+def cleanup_dhcp_agent_datalinks():
+    """Remove leftover DHCP agent VNICs along with their IP and OVS port.
+    Returns SMF_EXIT_ERR_FATAL if listing links or removing a VNIC/port fails.
+    """
+    cmd = ["/usr/sbin/dladm", "show-link", "-p", "-o", "link"]
+    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
+    output, error = p.communicate()
+    if p.returncode != 0:
+        print "failed to retrieve datalink names"
+        return smf_include.SMF_EXIT_ERR_FATAL
+
+    # DHCP agent datalinks are always 15 characters in length. They start with
+    # 'dh', end with '_0', and in between they are hexadecimal digits.
+    prog = re.compile('dh[0-9A-Fa-f\_]{11}_0')
+    ret_code = smf_include.SMF_EXIT_OK
+    ovs_bridge = get_ovs_bridge() if is_ml2_plugin() else None
+    for dlname in output.splitlines():
+        if prog.search(dlname) is None:
+            continue
+        try:
+            # first remove the IP
+            check_call(["/usr/bin/pfexec", "/usr/sbin/ipadm", "delete-ip",
+                        dlname])
+        except CalledProcessError:
+            # It is possible that the IP was already deleted but not the
+            # datalink. So we continue and try and delete the datalink.
+            pass
+        try:
+            # next remove the VNIC
+            check_call(["/usr/bin/pfexec", "/usr/sbin/dladm", "delete-vnic",
+                        dlname])
+            # remove the OVS Port
+            if ovs_bridge:
+                check_call(["/usr/bin/pfexec", "/usr/sbin/ovs-vsctl", "--",
+                            "--if-exists", "del-port", ovs_bridge, dlname])
+        except CalledProcessError as err:
+            print "failed to remove datalink '%s' used by DHCP agent: %s" % \
+                (dlname, err)
+            ret_code = smf_include.SMF_EXIT_ERR_FATAL
+    return ret_code
+
+
 def start():
     # verify paths are valid
     for f in sys.argv[2:4]:
@@ -52,6 +94,11 @@
             print '%s does not exist or is not readable' % f
             return smf_include.SMF_EXIT_ERR_CONFIG
 
+    # remove VNICs associated with DHCP agent if any were left over.
+    ret_code = cleanup_dhcp_agent_datalinks()
+    if ret_code != smf_include.SMF_EXIT_OK:
+        return ret_code
+
     # set the hostmodel property if necessary
     if not set_hostmodel("src-priority"):
         return smf_include.SMF_EXIT_ERR_FATAL
@@ -72,47 +119,22 @@
 
 
 def stop():
-    try:
-        # first kill the SMF contract
-        check_call(["/usr/bin/pkill", "-c", sys.argv[2]])
-    except CalledProcessError as err:
-        print "failed to kill the SMF contract: %s" % err
-        return smf_include.SMF_EXIT_ERR_FATAL
-
-    cmd = ["/usr/sbin/ipadm", "show-if", "-p", "-o", "ifname"]
-    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
-    output, error = p.communicate()
-    if p.returncode != 0:
-        print "failed to retrieve IP interface names"
+    # Keep issuing SIGTERM until the contract is empty. This way we will catch
+    # any child processes missed because they were being forked.
+    # Up to 50 attempts will be made at intervals of 2 seconds. Typically,
+    # only 0 or 1 additional attempts are needed before the contract is empty,
+    # but we chose to err on the side of caution. In the worst case, we spend
+    # 100 seconds in kill_contract(), which leaves 500 seconds (timeout is
+    # 600s) for the other cleanup tasks, after which the service will be put
+    # into maintenance state if the contract was not killed successfully.
+    if not kill_contract(50, 2, sys.argv[2]):
         return smf_include.SMF_EXIT_ERR_FATAL
 
-    ifnames = output.splitlines()
-    # DHCP agent datalinks are always 15 characters in length. They start with
-    # 'dh', end with '_0', and in between they are hexadecimal digits.
-    prog = re.compile('dh[0-9A-Fa-f\_]{11}_0')
-    err_delete = False
-    for ifname in ifnames:
-        if prog.search(ifname) is None:
-            continue
-        try:
-            # first remove the IP
-            check_call(["/usr/bin/pfexec", "/usr/sbin/ipadm", "delete-ip",
-                        ifname])
-            # next remove the VNIC
-            check_call(["/usr/bin/pfexec", "/usr/sbin/dladm", "delete-vnic",
-                        ifname])
-            # remove the OVS Port
-            if is_ml2_plugin():
-                check_call(["/usr/bin/pfexec", "/usr/sbin/ovs-vsctl", "--",
-                            "--if-exists", "del-port", get_ovs_bridge(),
-                            ifname])
-        except CalledProcessError as err:
-            print "failed to remove datalink '%s' used by DHCP agent: %s" % \
-                (ifname, err)
-            err_delete = True
+    # remove VNICs associated with DHCP agent
+    ret_code = cleanup_dhcp_agent_datalinks()
 
     # finally reset the hostmodel property
-    if not set_hostmodel("weak") or err_delete:
+    if not set_hostmodel("weak") or ret_code != smf_include.SMF_EXIT_OK:
         return smf_include.SMF_EXIT_ERR_FATAL
     return smf_include.SMF_EXIT_OK
 
--- a/components/openstack/neutron/files/neutron-dhcp-agent.xml	Tue Jul 12 11:11:21 2016 -0700
+++ b/components/openstack/neutron/files/neutron-dhcp-agent.xml	Tue Jul 12 11:18:58 2016 -0700
@@ -85,7 +85,7 @@
 
     <logfile_attributes permissions='600'/>
 
-    <exec_method timeout_seconds="60" type="method" name="start"
+    <exec_method timeout_seconds="600" type="method" name="start"
       exec="/lib/svc/method/neutron-dhcp-agent %m %{config/config_path} %{config/dhcp_config_path}">
       <method_context>
         <method_credential user='neutron' group='neutron'
--- a/components/openstack/neutron/files/neutron-l3-agent	Tue Jul 12 11:11:21 2016 -0700
+++ b/components/openstack/neutron/files/neutron-l3-agent	Tue Jul 12 11:18:58 2016 -0700
@@ -21,7 +21,7 @@
 import sys
 
 import netaddr
-from openstack_common import is_ml2_plugin
+from openstack_common import is_ml2_plugin, kill_contract
 import smf_include
 
 from neutron.agent.solaris import packetfilter
@@ -51,6 +51,49 @@
     return True
 
 
+def cleanup_l3_agent_datalinks():
+    """Remove leftover L3 agent VNICs along with their IP and OVS port.
+    Returns SMF_EXIT_ERR_FATAL if listing links or removing a VNIC/port fails.
+    """
+    cmd = ["/usr/sbin/dladm", "show-link", "-p", "-o", "link"]
+    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
+    output, error = p.communicate()
+    if p.returncode != 0:
+        print "failed to retrieve datalink names"
+        return smf_include.SMF_EXIT_ERR_FATAL
+
+    # L3 agent datalinks are always 15 characters in length: 'l3i' or 'l3e',
+    # followed by hexadecimal digits and ending with '_0'.
+    prog = re.compile('l3[ie][0-9A-Fa-f\_]{10}_0')
+    retcode = smf_include.SMF_EXIT_OK
+    is_ml2 = is_ml2_plugin()
+    for dlname in output.splitlines():
+        if prog.search(dlname) is None:
+            continue
+        try:
+            # first remove the IP
+            check_call(["/usr/bin/pfexec", "/usr/sbin/ipadm", "delete-ip",
+                        dlname])
+        except CalledProcessError:
+            # It is possible that the IP was already deleted but not the
+            # datalink. So we continue and try and delete the datalink.
+            pass
+        try:
+            # next remove the VNIC
+            check_call(["/usr/bin/pfexec", "/usr/sbin/dladm", "delete-vnic",
+                        dlname])
+            # remove the OVS Port from the bridge this datalink belongs to
+            ovs_bridge = get_ovs_bridge(dlname) if is_ml2 else None
+            if ovs_bridge:
+                check_call(["/usr/bin/pfexec", "/usr/sbin/ovs-vsctl", "--",
+                            "--if-exists", "del-port", ovs_bridge, dlname])
+        except CalledProcessError as err:
+            print "failed to remove datalink '%s' used by L3 agent: %s" % \
+                (dlname, err)
+            retcode = smf_include.SMF_EXIT_ERR_FATAL
+    return retcode
+
+
 def start():
     # verify paths are valid
     for f in sys.argv[2:6]:
@@ -58,6 +101,16 @@
             print '%s does not exist or is not readable' % f
             return smf_include.SMF_EXIT_ERR_CONFIG
 
+    # We need to remove VNICs associated with L3 agent if any were left over.
+    # Before that, we need to first remove the PF rules added under
+    # _auto/neutron:l3:agent anchor and then remove the IP interfaces on which
+    # the rules were applied.
+    pf = packetfilter.PacketFilter('_auto/neutron:l3:agent')
+    pf.remove_anchor_recursively()
+    ret_code = cleanup_l3_agent_datalinks()
+    if ret_code != smf_include.SMF_EXIT_OK:
+        return ret_code
+
     # System-wide forwarding (either ipv4 or ipv6 or both) must be enabled
     # before neutron-l3-agent can be started.
     cmd = ["/usr/sbin/ipadm", "show-prop", "-c", "-p", "forwarding",
@@ -117,9 +170,9 @@
     return smf_include.smf_subprocess(cmd)
 
 
-def get_ovs_bridge(ifname):
+def get_ovs_bridge(dlname):
     # retrieve the right OVS bridge based on the interface name
-    if ifname.startswith('l3i'):
+    if dlname.startswith('l3i'):
         config_file = '/etc/neutron/plugins/openvswitch/ovs_neutron_plugin.ini'
         section = "ovs"
         option = "integration_bridge"
@@ -138,11 +191,16 @@
 
 def stop():
     shutdown_vpn()
-    try:
-        # first kill the SMF contract
-        check_call(["/usr/bin/pkill", "-c", sys.argv[2]])
-    except CalledProcessError as err:
-        print "failed to kill the SMF contract: %s" % (err)
+    # Keep issuing SIGTERM until the contract is empty. This way we will catch
+    # any child processes missed because they were being forked.
+    # Up to 50 attempts will be made at intervals of 2 seconds. Typically,
+    # only 0 or 1 additional attempts are needed before the contract is empty,
+    # but we chose to err on the side of caution. In the worst case, we spend
+    # 100 seconds in kill_contract(), which leaves 500 seconds (timeout is
+    # 600s) for the other cleanup tasks, after which the service will be put
+    # into maintenance state if the contract was not killed successfully.
+    if not kill_contract(50, 2, sys.argv[2]):
+        return smf_include.SMF_EXIT_ERR_FATAL
 
     # We need to first remove the PF rules added under _auto/neutron:l3:agent
     # anchor and then remove the IP interfaces on which the rules were applied.
@@ -150,42 +208,10 @@
     pf.remove_anchor_recursively()
 
     # remove VNICs associated with L3 agent
-    cmd = ["/usr/sbin/ipadm", "show-if", "-p", "-o", "ifname"]
-    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
-    output, error = p.communicate()
-    if p.returncode != 0:
-        print "failed to retrieve IP interface names"
-        return smf_include.SMF_EXIT_ERR_CONFIG
-
-    ifnames = output.splitlines()
-    # L3 agent datalinks are always 15 characters in length. They start
-    # with either 'l3i' or 'l3e', end with '_0', and in between they are
-    # hexadecimal digits.
-    prog = re.compile('l3[ie][0-9A-Fa-f\_]{10}_0')
-    err_delete = False
-    for ifname in ifnames:
-        if prog.search(ifname) is None:
-            continue
-        try:
-            # first remove the IP
-            check_call(["/usr/bin/pfexec", "/usr/sbin/ipadm", "delete-ip",
-                        ifname])
-            # next remove the VNIC
-            check_call(["/usr/bin/pfexec", "/usr/sbin/dladm", "delete-vnic",
-                        ifname])
-            # remove the OVS Port
-            if is_ml2_plugin():
-                ovs_bridge = get_ovs_bridge(ifname)
-                if ovs_bridge:
-                    check_call(["/usr/bin/pfexec", "/usr/sbin/ovs-vsctl", "--",
-                                "--if-exists", "del-port", ovs_bridge, ifname])
-        except CalledProcessError as err:
-            print "failed to remove datalink '%s' used by L3 agent: %s" % \
-                (ifname, err)
-            err_delete = True
+    ret_code = cleanup_l3_agent_datalinks()
 
     # finally reset the hostmodel property
-    if not set_hostmodel("weak") or err_delete:
+    if not set_hostmodel("weak") or ret_code != smf_include.SMF_EXIT_OK:
         return smf_include.SMF_EXIT_ERR_FATAL
     return smf_include.SMF_EXIT_OK
 
--- a/components/openstack/neutron/files/neutron-l3-agent.xml	Tue Jul 12 11:11:21 2016 -0700
+++ b/components/openstack/neutron/files/neutron-l3-agent.xml	Tue Jul 12 11:18:58 2016 -0700
@@ -63,7 +63,7 @@
 
     <logfile_attributes permissions='600'/>
 
-    <exec_method timeout_seconds="60" type="method" name="start"
+    <exec_method timeout_seconds="600" type="method" name="start"
       exec="/lib/svc/method/neutron-l3-agent %m %{config/config_path} %{config/l3_config_path} %{config/vpn_config_path} %{config/ml2_config_path}">
       <method_context>
         <method_credential user='neutron' group='neutron'/>