accel-ppp.org

Posted: **11 Aug 2020, 22:36**

I've seen a situation which looks like a bug to me. It happens in a redundant / load balancing setup with two (or more) PPPoE servers on the same network. I've never seen it happen while running a similar setup with two MikroTik PPPoE servers for a long time earlier on the same network with mostly the same set of a few hundreds of clients.

Most of the time it works fine, but a few clients seem to be stuck connected to both PPPoE servers at the same time. Both servers periodically sent LCP echo requests to the client, and both get LCP echo replies. This continues indefinitely (hours) - even though the client really has a PPPoE session only with one server, it still responds to LCP echo requests for the no longer active session, keeping it alive at the server. Now, since I assign a static IP to each PPPoE username, this results in duplicate IP on both PPPoE servers which "redistribute connected" into dynamic routing protocols (previously OSPF, now iBGP following the recent BCP documents), and all sorts of problems for that client (they report Internet disappearing frequently, some web pages failing to load etc. as part of the traffic comes through the inactive PPPoE session).

When I looked at the traffic with tcpdump, I see these suspicious things:
- magic number is the same in LCP echo replies to both servers
- session IDs have only 10 bits really used, low 6 bits always zero

Looking at the code I see the check for looped-back link is done as in the RFC1661, but this part seems not implemented:
"Reception of a Magic-Number other than the negotiated local Magic-Number, the peer's negotiated Magic-Number, or zero if the peer didn't negotiate one, indicates a link which has been (mis)configured for communications with a different peer."

Also, probability of duplicate session ID (with only 10 of 16 bits used) is quite high. Not sure if this isn't also a buggy client (probably should respond only to LCP echo requests from the same MAC address of the server to which it has an active session, and also check the magic number), but we can't really change the clients (customers are free to use all sorts of their own cheap routers as long as they do PPPoE). Perhaps it could at least be fixed at the accel-ppp end. MikroTik seems to get this right (of course I haven't seen the code, but have been running such a redundant setup for a few years and have never seen a stuck duplicate PPPoE session on two servers that wouldn't clear by itself).

Posted: **20 Aug 2020, 19:02**

Proposed patch (warning: not tested yet other than it compiles):
- use uint32_t consistently for LCP magic numbers instead of (signed) int
- use mrand48() instead of random() for full 32 bits of randomness (like pppd does)
- remember peer's negotiated magic in lcp->peer_magic
- check it in received LCP echo requests and responses, ignore if not matching

The Magic-Number field of these packets SHOULD be inspected on
reception. All received Magic-Number fields MUST be equal to
either zero or the peer's unique Magic-Number, depending on
whether or not the peer negotiated a Magic-Number.

Code: Select all

diff --git a/accel-pppd/ppp/lcp_opt_magic.c b/accel-pppd/ppp/lcp_opt_magic.c
index 52ed26e..feffa60 100644
--- a/accel-pppd/ppp/lcp_opt_magic.c
+++ b/accel-pppd/ppp/lcp_opt_magic.c
@@ -20,7 +20,7 @@ static void magic_print(void (*print)(const char *fmt, ...), struct lcp_option_t
 struct magic_option_t
 {
 	struct lcp_option_t opt;
-	int magic;
+	uint32_t magic;
 };
 
 static struct lcp_option_handler_t magic_opt_hnd=
@@ -35,12 +35,12 @@ static struct lcp_option_handler_t magic_opt_hnd=
 	.print = magic_print,
 };
 
-static int nzmagic(int old)
+static uint32_t nzmagic(uint32_t old)
 {
-	int magic;
+	uint32_t magic;
 
 	do {
-		magic = random();
+		magic = mrand48();
 	} while (magic == old || !magic);
 
 	return magic;
@@ -56,6 +56,7 @@ static struct lcp_option_t *magic_init(struct ppp_lcp_t *lcp)
 	magic_opt->opt.len = 6;
 
 	lcp->magic = magic_opt->magic;
+	lcp->peer_magic = 0;
 
 	return &magic_opt->opt;
 }
@@ -106,6 +107,7 @@ static int magic_recv_conf_req(struct ppp_lcp_t *lcp, struct lcp_option_t *opt,
 	if (magic_opt->magic && magic_opt->magic == ntohl(opt32->val))
 		return LCP_OPT_NAK;
 
+	lcp->peer_magic = ntohl(opt32->val);
 	return LCP_OPT_ACK;
 }
 
@@ -115,6 +117,7 @@ static int magic_recv_conf_rej(struct ppp_lcp_t *lcp, struct lcp_option_t *opt,
 
 	magic_opt->magic = 0;
 	lcp->magic = 0;
+	lcp->peer_magic = 0;
 
 	return 0;
 }
@@ -150,6 +153,11 @@ static void magic_print(void (*print)(const char *fmt, ...), struct lcp_option_t
 
 static void magic_opt_init()
 {
+	struct timeval t;
+
+	gettimeofday(&t, NULL);
+	srand48(t.tv_sec ^ t.tv_usec);
+
 	lcp_option_register(&magic_opt_hnd);
 }
 
diff --git a/accel-pppd/ppp/ppp_lcp.c b/accel-pppd/ppp/ppp_lcp.c
index fdbb318..759121c 100644
--- a/accel-pppd/ppp/ppp_lcp.c
+++ b/accel-pppd/ppp/ppp_lcp.c
@@ -594,6 +594,9 @@ static void lcp_recv_echo_repl(struct ppp_lcp_t *lcp, uint8_t *data, int size)
 		if (lcp->magic && magic == lcp->magic) {
 			log_ppp_error("lcp: echo: loop-back detected\n");
 			ap_session_terminate(&lcp->ppp->ses, TERM_NAS_ERROR, 0);
+		} else if (magic && lcp->peer_magic && magic != lcp->peer_magic) {
+			log_ppp_warn("lcp: echo reply ignored, wrong magic %08x, expected %08x\n", magic, lcp->peer_magic);
+			return;
 		}
 	}
 
@@ -604,7 +607,12 @@ static void lcp_recv_echo_repl(struct ppp_lcp_t *lcp, uint8_t *data, int size)
 static void send_echo_reply(struct ppp_lcp_t *lcp)
 {
 	struct lcp_hdr_t *hdr = (struct lcp_hdr_t*)lcp->ppp->buf;
-	//uint32_t magic = *(uint32_t *)(hdr + 1);
+	uint32_t magic = ntohl(*(uint32_t *)(hdr + 1));
+
+	if (magic && lcp->peer_magic && magic != lcp->peer_magic) {
+		log_ppp_warn("lcp: echo request ignored, wrong magic %08x, expected %08x\n", magic, lcp->peer_magic);
+		return;
+	}
 
 	lcp->echo_sent = 0;
 	lcp->last_echo_ts = _time();
diff --git a/accel-pppd/ppp/ppp_lcp.h b/accel-pppd/ppp/ppp_lcp.h
index 10c55b0..386ab0c 100644
--- a/accel-pppd/ppp/ppp_lcp.h
+++ b/accel-pppd/ppp/ppp_lcp.h
@@ -121,7 +121,8 @@ struct ppp_lcp_t
 
 	struct triton_timer_t echo_timer;
 	int echo_sent;
-	int magic;
+	uint32_t magic;
+	uint32_t peer_magic;
 	unsigned long last_ipackets;
 	time_t last_echo_ts;

Posted: **13 Nov 2020, 09:38**

any check this patch i ok or not

accel-ppp.org

PPPoE session still open after client connects to different server

PPPoE session still open after client connects to different server

Re: PPPoE session still open after client connects to different server

Re: PPPoE session still open after client connects to different server