1 files changed, 45 insertions, 0 deletions
diff --git a/bgpd/bgp_packet.c b/bgpd/bgp_packet.c
index 4d7f32de..271a21a0 100644
--- a/bgpd/bgp_packet.c
+++ b/bgpd/bgp_packet.c
@@ -1307,6 +1307,51 @@ bgp_open_receive (struct peer *peer, bgp_size_t size)
 		 && realpeer->status != OpenConfirm)
 
  	{
+ 	  /* XXX: This is an awful problem.. 
+ 	   *
+ 	   * According to the RFC we should just let this connection (of the
+ 	   * accepted 'peer') continue on to Established if the other
+ 	   * connection (the 'realpeer' one) is in state Connect, and deal
+ 	   * with the more larval FSM as/when it gets far enough to receive
+ 	   * an Open. We don't do that though, we instead close the (more
+ 	   * developed) accepted connection.
+ 	   *
+ 	   * This means there's a race, which if hit, can loop:
+ 	   *
+ 	   *       FSM for A                        FSM for B
+ 	   *  realpeer     accept-peer       realpeer     accept-peer
+ 	   *
+ 	   *  Connect                        Connect
+ 	   *               Active
+ 	   *               OpenSent          OpenSent
+ 	   *               <arrive here,
+ 	   *               Notify, delete>   
+ 	   *                                 Idle         Active
+ 	   *   OpenSent                                   OpenSent
+ 	   *                                              <arrive here,
+ 	   *                                              Notify, delete>
+ 	   *   Idle
+ 	   *   <wait>                        <wait>
+ 	   *   Connect                       Connect
+ 	   *
+ 	   *
+ 	   * If both sides are Quagga, they are of course almost certain to
+ 	   * wait for the same amount of time (which doesn't preclude other
+ 	   * implementations also waiting for the same time). The race is
+ 	   * exacerbated by high-latency (in bgpd and/or the network).
+ 	   *
+ 	   * The reason we do this is that our FSM is tied to our peer
+ 	   * structure, which carries our configuration information, etc.
+ 	   * I.e. we can't let the accepted-peer FSM continue on as it is,
+ 	   * because it's not associated with any actual peer configuration -
+ 	   * it's just a dummy.
+ 	   *
+ 	   * It's possible we could hack-fix this by just bgp_stop'ing the
+ 	   * realpeer and continuing on with the 'transfer FSM' below.
+ 	   * Ideally, we need to separate FSMs from struct peer.
+ 	   *
+ 	   * Setting one side to passive avoids the race, as a workaround.
+ 	   */
  	  if (BGP_DEBUG (events, EVENTS))
 	    zlog_debug ("%s peer status is %s close connection",
 			realpeer->host, LOOKUP (bgp_status_msg,