Intermittent error "This socket has been ended by the other party" #144

vpsouza · 2016-09-27T11:30:33Z

Hi, i'm connecting to my neo4j running on a docker container and i'm getting an error when i try to attempt to make another connection/query. The error is "This socket has been ended by the other party". This is my Dockerfile with the opened ports:

EXPOSE 7474 7473 7687

This is my driver utility:

var neo4j = require("neo4j-driver").v1,
    config = require("../../config");

(function(module){ 
    module.exports = function(){
        config.neo4j.server = 'localhost';
        return neo4j.driver("bolt://" + config.neo4j.server, neo4j.auth.basic(config.neo4j.user, config.neo4j.password));
    }
}(module));

The way i'm making my queries to neo4j:

(function(module){
    var driver = require('../../../utils/neo4j-driver')(),
        Q = require('q'),
        logger = require('../../../utils/logger'),
        BaseNeo4jModel = require('../../../utils/neo4j-model');

    function userDBAuth(user){
        var deferred = Q.defer();
        var session = driver.session();

        session
            //.run( "MATCH (a:Person) RETURN a.name AS name, a.uuid as uuid" )
            .run("MATCH (user:User {email:{email}})"
                    + "RETURN user",
                    { email: user.email})
            .then(function (result) {
                logger.debug('fetching results...');

                if(result.records.length > 0){
                    var records = [];

                    result.records.forEach(function(record){
                        records.push(new BaseNeo4jModel(record));
                    });

                    done();

                    deferred.resolve(records);
                } else {
                    deferred.reject({sucess: false, message: 'User not Found'});
                }

            }).catch( function(err) {
                logger.error(err);
                done();
                deferred.reject(err.fields[0]);
            });

            return deferred.promise;

        function done(){
            session.close();
            driver.close();
        }
    }

    module.exports = userDBAuth;

}(module));

And this is my stack trace:

{"name":"auth-services","hostname":"MacBook-Pro-de-Vinicius.local","pid":16292,"level":50,"err":{"message":"This socket has been ended by the other party","name":"Error","stack":"Error: This socket has been ended by the other party
at TLSSocket.writeAfterFIN [as write] (net.js:286:12)
at NodeChannel.write (/Users/viniciussouza/Sites/WeddSocial/backend/weddsocial-auth/node_modules/neo4j-driver/lib/v1/internal/ch-node.js:285:20)
at Chunker.flush (/Users/viniciussouza/Sites/WeddSocial/backend/weddsocial-auth/node_modules/neo4j-driver/lib/v1/internal/chunking.js:114:18)
at Connection.sync (/Users/viniciussouza/Sites/WeddSocial/backend/weddsocial-auth/node_modules/neo4j-driver/lib/v1/internal/connector.js:487:21)
at Session.run (/Users/viniciussouza/Sites/WeddSocial/backend/weddsocial-auth/node_modules/neo4j-driver/lib/v1/session.js:89:20)
at userDBAuth (/Users/viniciussouza/Sites/WeddSocial/backend/weddsocial-auth/app/api/auth/services/userDBAuth.js:13:7)
at Object._auth [as auth] (/Users/viniciussouza/Sites/WeddSocial/backend/weddsocial-auth/app/api/auth/services/auth.js:9:3)
at Server.auth (/Users/viniciussouza/Sites/WeddSocial/backend/weddsocial-auth/app/api/auth/resources/auth.js:7:12)
at next (/Users/viniciussouza/Sites/WeddSocial/backend/weddsocial-auth/node_modules/restify/lib/server.js:906:30)
at f (/Users/viniciussouza/Sites/WeddSocial/backend/weddsocial-auth/node_modules/once/once.js:25:25)","code":"EPIPE"},"msg":"This socket has been ended by the other party","time":"2016-09-27T11:12:33.163Z","v":0}

I'm desperate, please help me!

The text was updated successfully, but these errors were encountered:

vpsouza · 2016-09-28T16:16:25Z

Same issue, with other bolt driver (this time java driver)....

Neo4j Bolt driver #229

viz · 2016-10-31T23:52:15Z

just saw the same symptoms with the javascript bolt driver. Using version 1.1.0-M02 : Query failing with this error:

at TLSSocket.writeAfterFIN [as write] (net.js line 284 col 12)
2
at NodeChannel.write (/app/node_modules/neo4j-driver/lib/v1/internal/ch-node.js line 329 col 20)
this._conn.write(buffer._buffer);
3
at Chunker.flush (/app/node_modules/neo4j-driver/lib/v1/internal/chunking.js line 114 col 18)
this._ch.write(out.getSlice(0, out.position));
4
at Connection.sync (/app/node_modules/neo4j-driver/lib/v1/internal/connector.js line 489 col 21)
this._chunker.flush();
5
at Session.run (/app/node_modules/neo4j-driver/lib/v1/session.js line 102 col 20)
this._conn.sync();
6
at Order.updateAllRights (/app/models/order.js line 300 col 13)
dbSession.run(query, queryParams)

viz · 2016-11-01T01:32:20Z

is anyone maintaining this driver? There's been no post on this issue for more than a month..

dozoisch · 2016-11-23T23:29:36Z

@viz I had the same issue, I checked the latest version of the driver from GitHub and so far so good. I'll update you if it starts again. "neo4j-driver": "neo4j/neo4j-javascript-driver#56c7995cd26bd8fc9b7769b2a42e99bb24087c80",

dozoisch · 2016-11-24T17:14:35Z

No, nevermind, still happening.

zhenlineo · 2016-11-24T17:46:49Z

Hi, sorry for long waiting time.
Some question to help to locate the error: Do you have any load balancer in the middle? Do you have any timeout on the connection with the bolt driver?

Could @vpsouza you share the docker image that you are using and give us some instruction how to re-produce this error?

dozoisch · 2016-11-24T17:48:49Z

@zhenlineo I'm using HAProxy in the middle, configured as it was said in the HAProxy doc something really close to this

This is the stacktrace based on commit 56c7995

Error: This socket has been ended by the other party
  at TLSSocket.writeAfterFIN [as write] (net.js, line 291 col 12)
  at NodeChannel.write (node_modules/neo4j-driver/lib/v1/internal/ch-node.js, line 329 col 20) 
  at Chunker.flush (node_modules/neo4j-driver/lib/v1/internal/chunking.js, line 114 col 18)
  at Connection.sync (node_modules/neo4j-driver/lib/v1/internal/connector.js, line 489 col 21)
  at Session.run (node_modules/neo4j-driver/lib/v1/session.js, line 102 col 20)

zhenlineo · 2016-11-24T18:10:55Z

Hi @dozoisch, your connection will get killed by HAProxy after 30s timeout? (Not sure the HAProxy setting means an idle timeout or just kill the tcp connection after 30s)

Some more info about this neo4j/neo4j-java-driver#213
Where a user provides a nice suggestion to do re-tries to avoid the problem.

dozoisch · 2016-11-24T18:16:21Z

@zhenlineo so the "timeout connect" is timeout when establishing the connection. The idle timeout is the "timeout client". (see http://serverfault.com/a/778222/138439 for more info)

I'm currently putting in place a retry mechanism. So that when this particular error happen I retry a number of times. I'll report on how that goes!

dozoisch · 2016-11-25T02:16:21Z

So I'm doing something similar to this:

const maxRetry = 4;
function doRun(cypher, params, numberOfTry) {
  const session = driver.session();
  return Promise.resolve().then(() => session.run(cypher, params))
    .then(
      results => {
        session.close();
        return results;
      },
      err => {
        session.close();
        if (numberOfTry < maxRetry && err.message.indexOf('ended by the other party') > -1) {
          // Try again
          return doRun(cypher, params, ++numberOfTry);
        }
        return Promise.reject(err);
      }
    );
}

exports.run = function(cypher, params) {
  return doRun(cypher, params, 0);
};

And it seems to be working so far. I've put the retry count higher than what is probably needed, but I'm logging when it happens so I'll be able to tweak it. It seems to be working so far!

It's not the perfect solution, but it does work around the issue.

zhenlineo · 2016-11-28T08:48:36Z

@dozoisch You might want to set the retry time to be higher than pool size which you could set in the config when you construct the driver. The driver will first try to reuse connections from the connection pool and then roll back to create new connections. So for the worst case, you need to retry poolSize + 1 times to get a fresh connection with the server.

dozoisch · 2016-11-28T17:10:27Z

Oh good idea!

louisiukas · 2016-12-19T10:33:44Z

I get this error in the following scenario:

Using neo4j 3.0.7 docker image and neo4j-javascript-driver 1.0.5
I'm setting up the connection to the db once using const neo4j=require('neo4j-driver').v1; const neo4jDriver =neo4j.driver(..), while using short lived sessions.
If the database has been down, but is up again and ready, at least the next session I request (var session=neo4jDriver.session();) will fail.
- It fails once it is used: session.run(...);
This is a bug in my book. A fresh session should not fail if the db is ready.
After one or more failed sessions, the connection works again.

lutovich · 2016-12-20T14:59:25Z

Hello @vpsouza, @viz, @louisiukas. Do you also use any proxy or load balancer between JS driver and Neo4j database?

Current implementation of the connection pool does not have any "test on borrow" or "idle connection refresh" functionality. So if idle pooled connection is killed/broken it'll just be given to the user code as is.

booboothefool · 2017-01-04T03:42:05Z

Hey @lutovich. I also seem to be getting this problem in my Heroku app, but not localhost.

I'm not sure, but I believe they use proxy/load balancer: https://devcenter.heroku.com/articles/how-heroku-works I also have IP-based geolocation in my app where on localhost it gets the proper location, but on Heroku it gets the server's location, if that gives you any idea of whether or not they use proxy/load balancer.

Update: I can confirm this is only happening on my Heroku app, and not localhost.

booboothefool · 2017-01-05T17:30:04Z

@zhenlineo @lutovich Is this an issue that will/can be fixed, or should I just implement retries in my code?

lutovich · 2017-01-05T18:01:05Z

@booboothefool I think it is better to have retries in your code for now. We'll try to work out a way to fix/mitigate this problem.

louisiukas · 2017-01-09T12:39:15Z

FYI: I don't use a load balancer but my app and database are running in separate linked docker containers (sandboxed on different ip addresses on same host). The connection breaks because I deliberately restart the neo4j-container to test for such a scenario in production.

lutovich · 2017-01-16T12:50:44Z

We've added test-on-borrow functionality to the Java driver (neo4j/neo4j-java-driver#297) and would like to get some feedback on that. If it turns out to be fine then similar feature will be added to the Javascript driver.

royipressburger · 2017-01-21T22:17:44Z

Also get similar issue:

EPIPE: undefined
  Error: This socket has been ended by the other party`

My neo4j machine is on aws behind an internal load balancer currently using version 1.0.5 of the driver..

Didn't had the chance to update to version 1.1.0 Ill update and keep you updated.
Thanks

royipressburger · 2017-01-22T17:40:53Z

@lutovich Is there anyway to temporary workaround this issue?

Can also add that even after removing the load balancer im still getting this issue.

lutovich · 2017-01-23T11:07:05Z

@royipressburger do you have any errors in the database logs debug.log or neo4j.log when connections were closed like this?
Now we can only suggest to do retries in the application code.

royipressburger · 2017-01-23T13:45:52Z

@lutovich Hi, I dont see any error in the logs.
And actually I do try to retry run each failed query and Im getting the same error (I try 3 times)
Im running the queries one after another.. Maybe I should sleep a second or two between them?

lutovich · 2017-01-23T13:52:40Z

@royipressburger query should be retried in a fresh session with some delay. Logging might be incomplete because of a problem fixed here: neo4j/neo4j#8662.
How often do your queries fail? I'm just thinking it might be a different problem if they always fail.

royipressburger · 2017-01-23T14:12:03Z

@lutovich Thanks for you answer ill try new session and a some delay.
Its hard to tell how often it happens, Its not always for sure, And its a bit more common with write queries.

booboothefool · 2017-01-23T17:38:49Z

Ever since I started doing this, I stopped seeing the error. Like it doesn't even need to retry... I literally do not see "socket has been ended by the other party" anymore. Don't really get it, but I'm happy, as my app always fetches successfully. It's been a few weeks too. 👍

async getUserPersona(id) {
    let result = {};

    // I previously had const session = this.connector.session() here,
    // when it was here, I got the "socket has been ended" issue

    try {
      const cypherGetUserPersona = `
        MATCH (:User {user_id: { id }})-[:PERSONIFIES]->(persona:Persona)
        OPTIONAL MATCH (persona)-[:SOURCE]->(source)
        RETURN persona, collect(source) as sources
      `;
      const queryResult = await retry(this.connector._bolt._pool._maxIdle + 1, async () => {

        // I moved it here, and all is good

        const session = this.connector.session();
        try {
          const res = await session.run(cypherGetUserPersona, { id });
          session.close();
          return res;
        } catch (neo4jError) {
          throw new Error(neo4jError);
        }
      });
      if (queryResult.records[0]) {
        const persona = queryResult.records[0].get('persona').properties;
        const sources = queryResult.records[0].get('sources').map(source => source.properties);
        result = Object.assign(persona, { sources });
      }
    } catch (error) {
      console.error('(getUserPersona) =>\t'.red, error);
    }
    return result;
  }

royipressburger · 2017-01-23T20:49:38Z

All you did is to move the location of the session creation? Looks odd

booboothefool · 2017-02-01T04:16:57Z

@royipressburger Yeah... that's really all I did. It doesn't happen on reads for me anymore, but still happens on writes. I agree with you that it's definitely too common with writes.

booboothefool · 2017-02-15T17:32:23Z

@lutovich @zhenlineo

What do you suggest is the proper way to do this? From my code above, I realize this completely breaks my app when I have a good amount of users. I end up with out of memory errors on my GrapheneDB. Am I really supposed to be retrying poolSize + 1 times? Default is 50, so 51 times, really? Also, should I be closing the session if it errors?

async getUserPersona(id) {
    let result = {};
    try {
      const cypherGetUserPersona = `
        MATCH (:User {user_id: { id }})-[:PERSONIFIES]->(persona:Persona)
        OPTIONAL MATCH (persona)-[:SOURCE]->(source)
        RETURN persona, collect(source) as sources
      `;
      const queryResult = await retry(this.connector._bolt._pool._maxIdle + 1, async () => {
        const session = this.connector.session();
        try {
          const res = await session.run(cypherGetUserPersona, { id });
          session.close();
          return res;
        } catch (neo4jError) {
          throw new Error(neo4jError);
        }
      });
      if (queryResult.records[0]) {
        const persona = queryResult.records[0].get('persona').properties;
        const sources = queryResult.records[0].get('sources').map(source => source.properties);
        result = Object.assign(persona, { sources });
      }
    } catch (error) {
      console.error('(getUserPersona) =>\t'.red, error);
    }
    return result;
  }

I switched it to this (only retry 3 times with 1500ms delay) and it seems to not destroy my servers with out of memory, also session.close() on the error, but really need to hear your suggestions as this is starting to give me a lot of problems in production now that I have more users.

const RETRY_COUNT = 3;
const RETRY_INTERVAL = 1500;

  async getUserPersona(id) {
    let result = {};
    try {
      const cypherGetUserPersona = `
        MATCH (:User {user_id: { id }})-[:PERSONIFIES]->(persona:Persona)
        OPTIONAL MATCH (persona)-[:SOURCE]->(source)
        RETURN persona, collect(source) as sources
      `;
      const queryResult = await retryInterval(RETRY_COUNT, RETRY_INTERVAL, async () => {
        const session = this.connector.session();
        try {
          const res = await session.run(cypherGetUserPersona, { id });
          session.close();
          return res;
        } catch (neo4jError) {
          session.close();
          throw new Error(neo4jError);
        }
      });
      if (queryResult.records[0]) {  // User has Persona
        const persona = queryResult.records[0].get('persona').properties;
        const sources = queryResult.records[0].get('sources').map(source => source.properties);
        result = Object.assign(persona, { sources });
      }
    } catch (error) {
      console.error('(getUserPersona) =>\t'.red, error);
    }
    return result;
  }

How would you write this?

Thank you.

lutovich · 2017-02-23T09:13:57Z

Hi @booboothefool,

Sorry for a late reply. Code with retryInterval looks good to me. It is a good idea to retry with a delay. You can also close session in the finally block but this should not change anything.
What library do you use for retryInterval?
What heap memory settings do you have for the GrapheneDB?

booboothefool · 2017-02-23T17:03:40Z

Hey @lutovich,

Yeah, it's been about a week and things seem to be working well with 3 retries, 1.5 second delay. poolSize + 1 = 51 retries seems like way too much... should probably just give up after 3 failures... lol.

I'm using some unknown little library for the retryInterval, but it works great! https://github.com/jlipps/asyncbox#retry

lutovich · 2017-02-24T13:13:03Z

Hi @booboothefool,

Heap size is not very big. Multiple back-to-back retries might cause database to spin up many new threads to serve new connections from the driver. Threads are really heavy objects, maybe this caused OOMs. This is pure speculation...

What kind of errors do you see when retry happens? Are they only "This socket has been ended by the other party"?

We plan to add API with retries for 1.2 release. It will probably be useful in this case.

booboothefool · 2017-03-23T08:26:19Z

@lutovich Yeah, only "This socket has been ended by other party".

I see 1.2 has been released. Is it recommended to switch over?

If so, can you show an example of how to use the retry API?

lutovich · 2017-03-24T14:42:20Z

@booboothefool I just added a readme item about new retry API here: #224
Full blown documentation is on it's way and will be available soon.

technige · 2017-06-15T13:15:37Z

Connection Management [2 + max 4]

zhenlineo · 2018-05-11T08:49:32Z

Besides using retry function, the ultimate connection timeout issue has been addressed by connection pool management feature.
Read more about our connection pool management enhancement here

This issue shall not be seen anymore. Thanks for raising this issue and give us your feedbacks.
I am closing this issue now. Please be free to add and reopen if the problem still persists.

vpsouza mentioned this issue Sep 28, 2016

Neo4j Bolt driver neo4j/neo4j-ogm#229

Closed

jremyf mentioned this issue May 9, 2017

ECONNRESET - Session Expired #242

Closed

zhenlineo closed this as completed May 11, 2018

Intermittent error "This socket has been ended by the other party" #144

Intermittent error "This socket has been ended by the other party" #144

Comments

vpsouza commented Sep 27, 2016 • edited Loading

vpsouza commented Sep 28, 2016

viz commented Oct 31, 2016 • edited Loading

viz commented Nov 1, 2016

dozoisch commented Nov 23, 2016

dozoisch commented Nov 24, 2016

zhenlineo commented Nov 24, 2016

dozoisch commented Nov 24, 2016 • edited Loading

zhenlineo commented Nov 24, 2016

dozoisch commented Nov 24, 2016

dozoisch commented Nov 25, 2016

zhenlineo commented Nov 28, 2016

dozoisch commented Nov 28, 2016

louisiukas commented Dec 19, 2016 • edited Loading

lutovich commented Dec 20, 2016

booboothefool commented Jan 4, 2017 • edited Loading

booboothefool commented Jan 5, 2017

lutovich commented Jan 5, 2017

louisiukas commented Jan 9, 2017

lutovich commented Jan 16, 2017

royipressburger commented Jan 21, 2017

royipressburger commented Jan 22, 2017 • edited Loading

lutovich commented Jan 23, 2017

royipressburger commented Jan 23, 2017

lutovich commented Jan 23, 2017

royipressburger commented Jan 23, 2017

booboothefool commented Jan 23, 2017 • edited Loading

royipressburger commented Jan 23, 2017

booboothefool commented Feb 1, 2017 • edited Loading

booboothefool commented Feb 15, 2017 • edited Loading

lutovich commented Feb 23, 2017

booboothefool commented Feb 23, 2017

lutovich commented Feb 24, 2017

booboothefool commented Mar 23, 2017

lutovich commented Mar 24, 2017

technige commented Jun 15, 2017

zhenlineo commented May 11, 2018

vpsouza commented Sep 27, 2016 •

edited

Loading

viz commented Oct 31, 2016 •

edited

Loading

dozoisch commented Nov 24, 2016 •

edited

Loading

louisiukas commented Dec 19, 2016 •

edited

Loading

booboothefool commented Jan 4, 2017 •

edited

Loading

royipressburger commented Jan 22, 2017 •

edited

Loading

booboothefool commented Jan 23, 2017 •

edited

Loading

booboothefool commented Feb 1, 2017 •

edited

Loading

booboothefool commented Feb 15, 2017 •

edited

Loading