For better or worse, this is how I have things set up now:
mongo client connects to:
-> AWS Internal Network (TCP) Load balancer
-> Traefik (reverse proxy)
-> mongo (from docker image mongo:4.4.17)
(This setup was a quick hack to get mongodb in place. I've done a lot of reading debugging this issue and learned maybe my setup isn't the best. But I've found nothing to suggest it shouldn't work.)
Issue:
Connections will work until they don't. My 'test' is a simple node script that opens a db connection and execute db.find(), then logs that. It works a few times, then will hang. The mongodb logs suggest the client reaches its timeout and disconnects. Other than the client-side timeout, I see no errors to explain what mongo was doing for 30s.
If I make requests directly to Traefik or to mongo this issue goes away entirely. So it must be related to the request going through the NLB.
Some Logs:
mongodb logs (level 0): Test script (worked, got valid repsonse and exited cleanly):
{"t":{"$date":"2022-11-08T18:46:15.848+00:00"},"s":"I", "c":"NETWORK", "id":22943, "ctx":"listener","msg":"Connection accepted","attr":{"remote":"10.20.21.43:57206","connectionId":13,"connectionCount":3}}
{"t":{"$date":"2022-11-08T18:46:15.853+00:00"},"s":"I", "c":"NETWORK", "id":51800, "ctx":"conn13","msg":"client metadata","attr":{"remote":"10.20.21.43:57206","client":"conn13","doc":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"}}}
{"t":{"$date":"2022-11-08T18:46:15.862+00:00"},"s":"I", "c":"NETWORK", "id":22943, "ctx":"listener","msg":"Connection accepted","attr":{"remote":"10.20.31.176:42852","connectionId":14,"connectionCount":4}}
{"t":{"$date":"2022-11-08T18:46:15.862+00:00"},"s":"I", "c":"NETWORK", "id":51800, "ctx":"conn14","msg":"client metadata","attr":{"remote":"10.20.31.176:42852","client":"conn14","doc":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"}}}
{"t":{"$date":"2022-11-08T18:46:15.878+00:00"},"s":"I", "c":"ACCESS", "id":20250, "ctx":"conn14","msg":"Authentication succeeded","attr":{"mechanism":"SCRAM-SHA-256","speculative":true,"principalName":"user","authenticationDatabase":"tracker","remote":"10.20.31.176:42852","extraInfo":{}}}
{"t":{"$date":"2022-11-08T18:46:15.892+00:00"},"s":"I", "c":"NETWORK", "id":22944, "ctx":"conn13","msg":"Connection ended","attr":{"remote":"10.20.21.43:57206","connectionId":13,"connectionCount":3}}
{"t":{"$date":"2022-11-08T18:46:15.894+00:00"},"s":"I", "c":"NETWORK", "id":22944, "ctx":"conn14","msg":"Connection ended","attr":{"remote":"10.20.31.176:42852","connectionId":14,"connectionCount":2}}
Exact same test script, run a few seconds later that timed out after 30 seconds:
{"t":{"$date":"2022-11-08T18:46:37.143+00:00"},"s":"I", "c":"NETWORK", "id":22943, "ctx":"listener","msg":"Connection accepted","attr":{"remote":"10.20.21.43:49370","connectionId":15,"connectionCount":3}}
{"t":{"$date":"2022-11-08T18:46:37.147+00:00"},"s":"I", "c":"NETWORK", "id":51800, "ctx":"conn15","msg":"client metadata","attr":{"remote":"10.20.21.43:49370","client":"conn15","doc":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"}}}
{"t":{"$date":"2022-11-08T18:46:43.980+00:00"},"s":"I", "c":"STORAGE", "id":22430, "ctx":"WTCheckpointThread","msg":"WiredTiger message","attr":{"message":"[1667933203:980496][1:0x7fc8a881e700], WT_SESSION.checkpoint: [WT_VERB_CHECKPOINT_PROGRESS] saving checkpoint snapshot min: 8, snapshot max: 8 snapshot count: 0, oldest timestamp: (0, 0) , meta checkpoint timestamp: (0, 0) base write gen: 195129"}}
{"t":{"$date":"2022-11-08T18:46:47.658+00:00"},"s":"I", "c":"NETWORK", "id":22943, "ctx":"listener","msg":"Connection accepted","attr":{"remote":"10.20.31.176:42300","connectionId":16,"connectionCount":4}}
{"t":{"$date":"2022-11-08T18:46:47.658+00:00"},"s":"I", "c":"NETWORK", "id":51800, "ctx":"conn16","msg":"client metadata","attr":{"remote":"10.20.31.176:42300","client":"conn16","doc":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"}}}
{"t":{"$date":"2022-11-08T18:47:07.163+00:00"},"s":"I", "c":"-", "id":20883, "ctx":"conn15","msg":"Interrupted operation as its client disconnected","attr":{"opId":2510}}
{"t":{"$date":"2022-11-08T18:47:07.163+00:00"},"s":"I", "c":"NETWORK", "id":22944, "ctx":"conn15","msg":"Connection ended","attr":{"remote":"10.20.21.43:49370","connectionId":15,"connectionCount":3}}
{"t":{"$date":"2022-11-08T18:47:07.163+00:00"},"s":"I", "c":"NETWORK", "id":22944, "ctx":"conn16","msg":"Connection ended","attr":{"remote":"10.20.31.176:42300","connectionId":16,"connectionCount":2}}
Same test script - Log Level 3: Unsuccessful (timeout at 30s)
{"t":{"$date":"2022-11-08T18:56:29.522+00:00"},"s":"I", "c":"NETWORK", "id":22943, "ctx":"listener","msg":"Connection accepted","attr":{"remote":"10.20.31.176:57422","connectionId":2,"connectionCount":2}}
{"t":{"$date":"2022-11-08T18:56:29.523+00:00"},"s":"D3", "c":"EXECUTOR", "id":22983, "ctx":"listener","msg":"Starting new executor thread in passthrough mode"}
{"t":{"$date":"2022-11-08T18:56:29.526+00:00"},"s":"D2", "c":"COMMAND", "id":21965, "ctx":"conn2","msg":"About to run the command","attr":{"db":"admin","commandArgs":{"ismaster":true,"client":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"},"compression":[],"helloOk":true,"$db":"admin"}}}
{"t":{"$date":"2022-11-08T18:56:29.526+00:00"},"s":"I", "c":"NETWORK", "id":51800, "ctx":"conn2","msg":"client metadata","attr":{"remote":"10.20.31.176:57422","client":"conn2","doc":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"}}}
{"t":{"$date":"2022-11-08T18:56:29.526+00:00"},"s":"D3", "c":"NETWORK", "id":22934, "ctx":"conn2","msg":"Starting server-side compression negotiation"}
{"t":{"$date":"2022-11-08T18:56:29.527+00:00"},"s":"D3", "c":"NETWORK", "id":22936, "ctx":"conn2","msg":"No compressors provided"}
{"t":{"$date":"2022-11-08T18:56:29.527+00:00"},"s":"I", "c":"COMMAND", "id":51803, "ctx":"conn2","msg":"Slow query","attr":{"type":"command","ns":"admin.$cmd","command":{"ismaster":true,"client":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"},"compression":[],"helloOk":true,"$db":"admin"},"numYields":0,"reslen":329,"locks":{},"protocol":"op_query","durationMillis":0}}
{"t":{"$date":"2022-11-08T18:56:29.527+00:00"},"s":"D1", "c":"QUERY", "id":22790, "ctx":"conn2","msg":"Received interrupt request for unknown op","attr":{"opId":317}}
{"t":{"$date":"2022-11-08T18:56:29.527+00:00"},"s":"D2", "c":"QUERY", "id":22783, "ctx":"conn2","msg":"Ops known during interrupt","attr":{"ops":[]}}
{"t":{"$date":"2022-11-08T18:56:30.034+00:00"},"s":"D2", "c":"COMMAND", "id":21965, "ctx":"conn2","msg":"About to run the command","attr":{"db":"admin","commandArgs":{"hello":true,"helloOk":true,"maxAwaitTimeMS":10000,"topologyVersion":{"processId":{"$oid":"636aa6433c5de5c55d30c876"},"counter":0},"$db":"admin"}}}
{"t":{"$date":"2022-11-08T18:56:30.034+00:00"},"s":"D3", "c":"FTDC", "id":23904, "ctx":"conn2","msg":"Using maxAwaitTimeMS for awaitable isMaster protocol."}
{"t":{"$date":"2022-11-08T18:56:36.026+00:00"},"s":"D2", "c":"WRITE", "id":21976, "ctx":"abortExpiredTransactions","msg":"Scanning sessions","attr":{"sessionCount":0}}
{"t":{"$date":"2022-11-08T18:56:40.037+00:00"},"s":"I", "c":"NETWORK", "id":22943, "ctx":"listener","msg":"Connection accepted","attr":{"remote":"10.20.31.176:57284","connectionId":3,"connectionCount":3}}
{"t":{"$date":"2022-11-08T18:56:40.037+00:00"},"s":"D3", "c":"EXECUTOR", "id":22983, "ctx":"listener","msg":"Starting new executor thread in passthrough mode"}
{"t":{"$date":"2022-11-08T18:56:40.038+00:00"},"s":"D2", "c":"COMMAND", "id":21965, "ctx":"conn3","msg":"About to run the command","attr":{"db":"admin","commandArgs":{"ismaster":true,"client":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"},"compression":[],"helloOk":true,"$db":"admin"}}}
{"t":{"$date":"2022-11-08T18:56:40.038+00:00"},"s":"I", "c":"NETWORK", "id":51800, "ctx":"conn3","msg":"client metadata","attr":{"remote":"10.20.31.176:57284","client":"conn3","doc":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"}}}
{"t":{"$date":"2022-11-08T18:56:40.038+00:00"},"s":"D3", "c":"NETWORK", "id":22934, "ctx":"conn3","msg":"Starting server-side compression negotiation"}
{"t":{"$date":"2022-11-08T18:56:40.038+00:00"},"s":"D3", "c":"NETWORK", "id":22936, "ctx":"conn3","msg":"No compressors provided"}
{"t":{"$date":"2022-11-08T18:56:40.038+00:00"},"s":"I", "c":"COMMAND", "id":51803, "ctx":"conn3","msg":"Slow query","attr":{"type":"command","ns":"admin.$cmd","command":{"ismaster":true,"client":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"},"compression":[],"helloOk":true,"$db":"admin"},"numYields":0,"reslen":329,"locks":{},"protocol":"op_query","durationMillis":0}}
{"t":{"$date":"2022-11-08T18:56:40.038+00:00"},"s":"D1", "c":"QUERY", "id":22790, "ctx":"conn3","msg":"Received interrupt request for unknown op","attr":{"opId":452}}
{"t":{"$date":"2022-11-08T18:56:40.038+00:00"},"s":"D2", "c":"QUERY", "id":22783, "ctx":"conn3","msg":"Ops known during interrupt","attr":{"ops":[]}}
{"t":{"$date":"2022-11-08T18:56:40.044+00:00"},"s":"D3", "c":"NETWORK", "id":22934, "ctx":"conn2","msg":"Starting server-side compression negotiation"}
{"t":{"$date":"2022-11-08T18:56:40.044+00:00"},"s":"D3", "c":"NETWORK", "id":22935, "ctx":"conn2","msg":"Compression negotiation not requested by client"}
{"t":{"$date":"2022-11-08T18:56:40.044+00:00"},"s":"D3", "c":"FTDC", "id":23905, "ctx":"conn2","msg":"Using exhaust for isMaster or hello protocol"}
{"t":{"$date":"2022-11-08T18:56:40.044+00:00"},"s":"I", "c":"COMMAND", "id":51803, "ctx":"conn2","msg":"Slow query","attr":{"type":"command","ns":"admin.$cmd","command":{"hello":true,"helloOk":true,"maxAwaitTimeMS":10000,"topologyVersion":{"processId":{"$oid":"636aa6433c5de5c55d30c876"},"counter":0},"$db":"admin"},"numYields":0,"reslen":323,"locks":{},"protocol":"op_msg","durationMillis":0}}
{"t":{"$date":"2022-11-08T18:56:40.044+00:00"},"s":"D1", "c":"QUERY", "id":22790, "ctx":"conn2","msg":"Received interrupt request for unknown op","attr":{"opId":326}}
{"t":{"$date":"2022-11-08T18:56:40.044+00:00"},"s":"D2", "c":"QUERY", "id":22783, "ctx":"conn2","msg":"Ops known during interrupt","attr":{"ops":[]}}
{"t":{"$date":"2022-11-08T18:56:40.045+00:00"},"s":"D2", "c":"COMMAND", "id":21965, "ctx":"conn2","msg":"About to run the command","attr":{"db":"admin","commandArgs":{"hello":true,"helloOk":true,"maxAwaitTimeMS":10000,"topologyVersion":{"processId":{"$oid":"636aa6433c5de5c55d30c876"},"counter":0},"$db":"admin"}}}
{"t":{"$date":"2022-11-08T18:56:40.045+00:00"},"s":"D3", "c":"FTDC", "id":23904, "ctx":"conn2","msg":"Using maxAwaitTimeMS for awaitable isMaster protocol."}
{"t":{"$date":"2022-11-08T18:56:50.043+00:00"},"s":"D2", "c":"COMMAND", "id":21965, "ctx":"conn3","msg":"About to run the command","attr":{"db":"admin","commandArgs":{"ismaster":1,"$db":"admin"}}}
{"t":{"$date":"2022-11-08T18:56:50.043+00:00"},"s":"D3", "c":"NETWORK", "id":22934, "ctx":"conn3","msg":"Starting server-side compression negotiation"}
{"t":{"$date":"2022-11-08T18:56:50.043+00:00"},"s":"D3", "c":"NETWORK", "id":22935, "ctx":"conn3","msg":"Compression negotiation not requested by client"}
{"t":{"$date":"2022-11-08T18:56:50.043+00:00"},"s":"I", "c":"COMMAND", "id":51803, "ctx":"conn3","msg":"Slow query","attr":{"type":"command","ns":"admin.$cmd","command":{"ismaster":1,"$db":"admin"},"numYields":0,"reslen":304,"locks":{},"protocol":"op_msg","durationMillis":0}}
{"t":{"$date":"2022-11-08T18:56:50.043+00:00"},"s":"D1", "c":"QUERY", "id":22790, "ctx":"conn3","msg":"Received interrupt request for unknown op","attr":{"opId":579}}
{"t":{"$date":"2022-11-08T18:56:50.043+00:00"},"s":"D2", "c":"QUERY", "id":22783, "ctx":"conn3","msg":"Ops known during interrupt","attr":{"ops":[]}}
{"t":{"$date":"2022-11-08T18:56:50.055+00:00"},"s":"D3", "c":"NETWORK", "id":22934, "ctx":"conn2","msg":"Starting server-side compression negotiation"}
{"t":{"$date":"2022-11-08T18:56:50.055+00:00"},"s":"D3", "c":"NETWORK", "id":22935, "ctx":"conn2","msg":"Compression negotiation not requested by client"}
{"t":{"$date":"2022-11-08T18:56:50.055+00:00"},"s":"D3", "c":"FTDC", "id":23905, "ctx":"conn2","msg":"Using exhaust for isMaster or hello protocol"}
{"t":{"$date":"2022-11-08T18:56:50.055+00:00"},"s":"I", "c":"COMMAND", "id":51803, "ctx":"conn2","msg":"Slow query","attr":{"type":"command","ns":"admin.$cmd","command":{"hello":true,"helloOk":true,"maxAwaitTimeMS":10000,"topologyVersion":{"processId":{"$oid":"636aa6433c5de5c55d30c876"},"counter":0},"$db":"admin"},"numYields":0,"reslen":323,"locks":{},"protocol":"op_msg","durationMillis":0}}
{"t":{"$date":"2022-11-08T18:56:50.055+00:00"},"s":"D1", "c":"QUERY", "id":22790, "ctx":"conn2","msg":"Received interrupt request for unknown op","attr":{"opId":453}}
{"t":{"$date":"2022-11-08T18:56:50.055+00:00"},"s":"D2", "c":"QUERY", "id":22783, "ctx":"conn2","msg":"Ops known during interrupt","attr":{"ops":[]}}
{"t":{"$date":"2022-11-08T18:56:50.055+00:00"},"s":"D2", "c":"COMMAND", "id":21965, "ctx":"conn2","msg":"About to run the command","attr":{"db":"admin","commandArgs":{"hello":true,"helloOk":true,"maxAwaitTimeMS":10000,"topologyVersion":{"processId":{"$oid":"636aa6433c5de5c55d30c876"},"counter":0},"$db":"admin"}}}
{"t":{"$date":"2022-11-08T18:56:50.055+00:00"},"s":"D3", "c":"FTDC", "id":23904, "ctx":"conn2","msg":"Using maxAwaitTimeMS for awaitable isMaster protocol."}
{"t":{"$date":"2022-11-08T18:56:54.149+00:00"},"s":"D1", "c":"EXECUTOR", "id":23106, "ctx":"AuthorizationManager-0","msg":"Reaping this thread","attr":{"nextThreadRetirementDate":{"$date":"2022-11-08T18:57:24.149Z"}}}
{"t":{"$date":"2022-11-08T18:56:54.149+00:00"},"s":"D1", "c":"EXECUTOR", "id":23105, "ctx":"AuthorizationManager-0","msg":"Shutting down thread","attr":{"threadName":"AuthorizationManager-0","poolName":"AuthorizationManager"}}
{"t":{"$date":"2022-11-08T18:56:59.543+00:00"},"s":"I", "c":"-", "id":20883, "ctx":"conn2","msg":"Interrupted operation as its client disconnected","attr":{"opId":580}}
{"t":{"$date":"2022-11-08T18:56:59.543+00:00"},"s":"D3", "c":"-", "id":4892201, "ctx":"conn2","msg":"Internal assertion","attr":{"error":{"code":279,"codeName":"ClientDisconnect","errmsg":"operation was interrupted"},"location":"{fileName:\"src/mongo/util/interruptible.h\", line:398, functionName:\"operator()\"}"}}
{"t":{"$date":"2022-11-08T18:56:59.543+00:00"},"s":"D2", "c":"NETWORK", "id":22986, "ctx":"conn3","msg":"Session from remote encountered a network error during SourceMessage","attr":{"remote":"10.20.31.176:57284","error":{"code":6,"codeName":"HostUnreachable","errmsg":"Connection closed by peer"}}}
{"t":{"$date":"2022-11-08T18:56:59.543+00:00"},"s":"I", "c":"NETWORK", "id":22944, "ctx":"conn3","msg":"Connection ended","attr":{"remote":"10.20.31.176:57284","connectionId":3,"connectionCount":2}}
{"t":{"$date":"2022-11-08T18:56:59.543+00:00"},"s":"D1", "c":"COMMAND", "id":21962, "ctx":"conn2","msg":"Assertion while executing command","attr":{"command":"hello","db":"admin","commandArgs":{"hello":true,"helloOk":true,"maxAwaitTimeMS":10000,"topologyVersion":{"processId":{"$oid":"636aa6433c5de5c55d30c876"},"counter":0},"$db":"admin"},"error":"ClientDisconnect: operation was interrupted"}}
{"t":{"$date":"2022-11-08T18:56:59.543+00:00"},"s":"I", "c":"COMMAND", "id":51803, "ctx":"conn2","msg":"Slow query","attr":{"type":"command","ns":"admin.$cmd","command":{"hello":true,"helloOk":true,"maxAwaitTimeMS":10000,"topologyVersion":{"processId":{"$oid":"636aa6433c5de5c55d30c876"},"counter":0},"$db":"admin"},"numYields":0,"ok":0,"errMsg":"operation was interrupted","errName":"ClientDisconnect","errCode":279,"reslen":117,"locks":{},"protocol":"op_msg","durationMillis":0}}
Test Script:
const mongodb = require("mongodb");
let url;
//url = 'mongodb://user:[email protected]:3204/tracker';
//url = 'mongodb://user:[email protected]:3204/tracker';
//url = 'mongodb://user:[email protected]:3204/tracker';
url = 'mongodb:/user:[email protected]:3204/tracker';
console.log(url);
dbclient = new mongodb.MongoClient(url, {auto_reconnect:true, ssl: false, poolSize: 16, keepAlive: true, retryWrites:false, useUnifiedTopology: true})
console.log(dbclient);
dbclient.connect(function(err) {
if (err) {
console.error("connection error:", err);
}
db = dbclient.db('tracker')
db.collection('visitors', function(a,b){
b.find({aid:7}).toArray(function(err, items) {
console.log(items.length);
console.log(items[0]);
dbclient.close();
});
});
});
We had the same problem. We have a MongoDB exposed via an AWS NLB deployed within our Kubernetes Cluster. We could test connections were timing out by running
curl nlb_address:27017 -v
from a dummy pod inside of our cluster.Upon further investigating we came across
https://docs.aws.amazon.com/elasticloadbalancing/latest/network/load-balancer-troubleshooting.html#loopback-timeout
This made sense for us as the connections which were being dropped came from within our kubernetes cluster, our mongodb was also deployed within our kubernetes cluster.
Solution:
Disable Client IP preservation.
on the LB Target Group.Which can either be done via the AWS Load Balancer Controller with the following annotation: https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.6/guide/service/annotations/
service.beta.kubernetes.io/aws-load-balancer-target-group-attributes: preserve_client_ip.enabled=true
OR via the AWS Dashboard: Toggle Off
Preserve Client IP Addresses
After toggling off and rerunning a few curl tests, we noticed that we were no longer being timedout.
notes:
Cross Zone Load Balancing
on the LB which did not work for us.references:
https://build.thebeat.co/a-curious-case-of-aws-nlb-timeouts-in-kubernetes-522bd88a3399
https://repost.aws/questions/QUbwjtwhGOTZ-h2eTy3MCjsw/aws-eks-internal-network-load-balancer-nlb-timed-out-while-trying-to-connect-from-underlying-eks-worker-node