mongoose - Mongodb aggregation $group, restrict length of array -


i want group documents according field restrict number of documents grouped each value.

each message has conversation_id. need 10 or lesser number of messages each conversation_id.

i able group according following command can't figure out how restrict number of grouped documents apart slicing results message.aggregate({'$group':{_id:'$conversation_id',msgs:{'$push':{msgid:'$_id'}}}})

how limit length of msgs array each conversation_id 10?

modern

from mongodb 3.6 there "novel" approach using $lookup perform "self join" in same way original cursor processing demonstrated below.

since in release can specify "pipeline" argument $lookup source "join", means can use $match , $limit gather , "limit" entries array:

db.messages.aggregate([   { "$group": { "_id": "$conversation_id" } },   { "$lookup": {     "from": "messages",     "let": { "conversation": "$_id" },     "pipeline": [       { "$match": { "$expr": { "$eq": [ "$conversation_id", "$$conversation" ] } }},       { "$limit": 10 },       { "$project": { "_id": 1 } }     ],     "as": "msgs"   }} ]) 

you can optionally add additional projection after $lookup in order make array items values rather documents _id key, basic result there doing above.

there still outstanding server-9277 requests "limit push" directly, using $lookup in way viable alternative in interim.

note: there $slice introduced after writing original answer , mentioned "outstanding jira issue" in original content. whilst can same result small result sets, involve still "pushing everything" array , later limiting final array output desired length.

so that's main distinction , why it's not practical $slice large results. of course can alternately used in cases is.

there few more details on mongodb group values multiple fields either alternate usage.


original

as stated earlier, not impossible horrible problem.

actually if main concern resulting arrays going exceptionally large, best approach submit each distinct "conversation_id" individual query , combine results. in mongodb 2.6 syntax might need tweaking depending on language implementation is:

var results = []; db.messages.aggregate([     { "$group": {         "_id": "$conversation_id"     }} ]).foreach(function(doc) {     db.messages.aggregate([         { "$match": { "conversation_id": doc._id } },         { "$limit": 10 },         { "$group": {             "_id": "$conversation_id",             "msgs": { "$push": "$_id" }         }}     ]).foreach(function(res) {         results.push( res );     }); }); 

but depends on whether trying avoid. on real answer:


the first issue here there no function "limit" number of items "pushed" array. like, functionality not presently exist.

the second issue when pushing items array, cannot use $slice, or similar operator in aggregation pipeline. there no present way "top 10" results produced array simple operation.

but can produce set of operations "slice" on grouping boundaries. involved, , example here reduce array elements "sliced" "six" only. main reason here demonstrate process , show how without being destructive arrays not contain total want "slice" to.

given sample of documents:

{ "_id" : 1, "conversation_id" : 123 } { "_id" : 2, "conversation_id" : 123 } { "_id" : 3, "conversation_id" : 123 } { "_id" : 4, "conversation_id" : 123 } { "_id" : 5, "conversation_id" : 123 } { "_id" : 6, "conversation_id" : 123 } { "_id" : 7, "conversation_id" : 123 } { "_id" : 8, "conversation_id" : 123 } { "_id" : 9, "conversation_id" : 123 } { "_id" : 10, "conversation_id" : 123 } { "_id" : 11, "conversation_id" : 123 } { "_id" : 12, "conversation_id" : 456 } { "_id" : 13, "conversation_id" : 456 } { "_id" : 14, "conversation_id" : 456 } { "_id" : 15, "conversation_id" : 456 } { "_id" : 16, "conversation_id" : 456 } 

you can see there when grouping conditions 1 array ten elements , "five". want here reduce both top "six" without "destroying" array match "five" elements.

and following query:

db.messages.aggregate([     { "$group": {         "_id": "$conversation_id",         "first": { "$first": "$_id" },         "msgs": { "$push": "$_id" },     }},     { "$unwind": "$msgs" },     { "$project": {         "msgs": 1,         "first": 1,         "seen": { "$eq": [ "$first", "$msgs" ] }     }},     { "$sort": { "seen": 1 }},     { "$group": {         "_id": "$_id",         "msgs": {              "$push": {                "$cond": [ { "$not": "$seen" }, "$msgs", false ]             }         },         "first": { "$first": "$first" },         "second": { "$first": "$msgs" }     }},     { "$unwind": "$msgs" },     { "$project": {         "msgs": 1,         "first": 1,         "second": 1,         "seen": { "$eq": [ "$second", "$msgs" ] }     }},     { "$sort": { "seen": 1 }},     { "$group": {         "_id": "$_id",         "msgs": {              "$push": {                "$cond": [ { "$not": "$seen" }, "$msgs", false ]             }         },         "first": { "$first": "$first" },         "second": { "$first": "$second" },         "third": { "$first": "$msgs" }     }},     { "$unwind": "$msgs" },     { "$project": {         "msgs": 1,         "first": 1,         "second": 1,         "third": 1,         "seen": { "$eq": [ "$third", "$msgs" ] },     }},     { "$sort": { "seen": 1 }},     { "$group": {         "_id": "$_id",         "msgs": {              "$push": {                "$cond": [ { "$not": "$seen" }, "$msgs", false ]             }         },         "first": { "$first": "$first" },         "second": { "$first": "$second" },         "third": { "$first": "$third" },         "forth": { "$first": "$msgs" }     }},     { "$unwind": "$msgs" },     { "$project": {         "msgs": 1,         "first": 1,         "second": 1,         "third": 1,         "forth": 1,         "seen": { "$eq": [ "$forth", "$msgs" ] }     }},     { "$sort": { "seen": 1 }},     { "$group": {         "_id": "$_id",         "msgs": {              "$push": {                "$cond": [ { "$not": "$seen" }, "$msgs", false ]             }         },         "first": { "$first": "$first" },         "second": { "$first": "$second" },         "third": { "$first": "$third" },         "forth": { "$first": "$forth" },         "fifth": { "$first": "$msgs" }     }},     { "$unwind": "$msgs" },     { "$project": {         "msgs": 1,         "first": 1,         "second": 1,         "third": 1,         "forth": 1,         "fifth": 1,         "seen": { "$eq": [ "$fifth", "$msgs" ] }     }},     { "$sort": { "seen": 1 }},     { "$group": {         "_id": "$_id",         "msgs": {              "$push": {                "$cond": [ { "$not": "$seen" }, "$msgs", false ]             }         },         "first": { "$first": "$first" },         "second": { "$first": "$second" },         "third": { "$first": "$third" },         "forth": { "$first": "$forth" },         "fifth": { "$first": "$fifth" },         "sixth": { "$first": "$msgs" },     }},     { "$project": {          "first": 1,          "second": 1,          "third": 1,          "forth": 1,          "fifth": 1,          "sixth": 1,          "pos": { "$const": [ 1,2,3,4,5,6 ] }     }},     { "$unwind": "$pos" },     { "$group": {         "_id": "$_id",         "msgs": {             "$push": {                 "$cond": [                     { "$eq": [ "$pos", 1 ] },                     "$first",                     { "$cond": [                         { "$eq": [ "$pos", 2 ] },                         "$second",                         { "$cond": [                             { "$eq": [ "$pos", 3 ] },                             "$third",                             { "$cond": [                                 { "$eq": [ "$pos", 4 ] },                                 "$forth",                                 { "$cond": [                                     { "$eq": [ "$pos", 5 ] },                                     "$fifth",                                     { "$cond": [                                         { "$eq": [ "$pos", 6 ] },                                         "$sixth",                                         false                                     ]}                                 ]}                             ]}                         ]}                     ]}                 ]             }         }     }},     { "$unwind": "$msgs" },     { "$match": { "msgs": { "$ne": false } }},     { "$group": {         "_id": "$_id",         "msgs": { "$push": "$msgs" }     }} ]) 

you top results in array, 6 entries:

{ "_id" : 123, "msgs" : [ 1, 2, 3, 4, 5, 6 ] } { "_id" : 456, "msgs" : [ 12, 13, 14, 15 ] } 

as can see here, loads of fun.

after have grouped want "pop" $first value off of stack array results. make process simplified little, in initial operation. process becomes:

  • $unwind array
  • compare values seen $eq equality match
  • $sort results "float" false unseen values top ( still retains order )
  • $group again , "pop" $first unseen value next member on stack. uses $cond operator replace "seen" values in array stack false in evaluation.

the final action $cond there make sure future iterations not adding last value of array on , on "slice" count greater array members.

that whole process needs repeated many items wish "slice". since found "first" item in initial grouping, means n-1 iterations desired slice result.

the final steps optional illustration of converting arrays result shown. conditionally pushing items or false matching position , "filtering" out false values end arrays have "six" , "five" members respectively.

so there not standard operator accommodate this, , cannot "limit" push 5 or 10 or whatever items in array. if have it, best approach.


you possibly approach mapreduce , forsake aggregation framework together. approach take ( within reasonable limits ) have in-memory hash-map on server , accumulate arrays that, while using javascript slice "limit" results:

db.messages.mapreduce(     function () {          if ( !stash.hasownproperty(this.conversation_id) ) {             stash[this.conversation_id] = [];         }          if ( stash[this.conversation_id.length < maxlen ) {             stash[this.conversation_id].push( this._id );             emit( this.conversation_id, 1 );         }      },     function(key,values) {         return 1;   // want keep keys     },     {          "scope": { "stash": {}, "maxlen": 10 },         "finalize": function(key,value) {             return { "msgs": stash[key] };                         },         "out": { "inline": 1 }     } ) 

so builds "in-memory" object matching emitted "keys" array never exceeding maximum size want fetch results. additionally not bother "emit" item when maximum stack met.

the reduce part nothing other reduce "key" , single value. in case our reducer did not called, true if 1 value existed key, finalize function takes care of mapping "stash" keys final output.

the effectiveness of varies on size of output, , javascript evaluation not fast, possibly faster processing large arrays in pipeline.


vote jira issues have "slice" operator or "limit" on "$push" , "$addtoset", both handy. hoping @ least modification can made $map operator expose "current index" value when processing. allow "slicing" , other operations.

really want code "generate" of required iterations. if answer here gets enough love and/or other time pending have in tuits, might add code demonstrate how this. reasonably long response.


code generate pipeline:

var key = "$conversation_id"; var val = "$_id"; var maxlen = 10;  var stack = []; var pipe = []; var fproj = { "$project": { "pos": { "$const": []  } } };  ( var x = 1; x <= maxlen; x++ ) {      fproj["$project"][""+x] = 1;     fproj["$project"]["pos"]["$const"].push( x );      var rec = {         "$cond": [ { "$eq": [ "$pos", x ] }, "$"+x ]     };     if ( stack.length == 0 ) {         rec["$cond"].push( false );     } else {         lval = stack.pop();         rec["$cond"].push( lval );     }      stack.push( rec );      if ( x == 1) {         pipe.push({ "$group": {            "_id": key,            "1": { "$first": val },            "msgs": { "$push": val }         }});     } else {         pipe.push({ "$unwind": "$msgs" });         var proj = {             "$project": {                 "msgs": 1             }         };          proj["$project"]["seen"] = { "$eq": [ "$"+(x-1), "$msgs" ] };          var grp = {             "$group": {                 "_id": "$_id",                 "msgs": {                     "$push": {                         "$cond": [ { "$not": "$seen" }, "$msgs", false ]                     }                 }             }         };          ( n=x; n >= 1; n-- ) {             if ( n != x )                  proj["$project"][""+n] = 1;             grp["$group"][""+n] = ( n == x ) ? { "$first": "$msgs" } : { "$first": "$"+n };         }          pipe.push( proj );         pipe.push({ "$sort": { "seen": 1 } });         pipe.push(grp);     } }  pipe.push(fproj); pipe.push({ "$unwind": "$pos" }); pipe.push({     "$group": {         "_id": "$_id",         "msgs": { "$push": stack[0] }     } }); pipe.push({ "$unwind": "$msgs" }); pipe.push({ "$match": { "msgs": { "$ne": false } }}); pipe.push({     "$group": {         "_id": "$_id",         "msgs": { "$push": "$msgs" }     } });  

that builds basic iterative approach maxlen steps $unwind $group. embedded in there details of final projections required , "nested" conditional statement. last approach taken on question:

does mongodb's $in clause guarantee order?


Comments

Popular posts from this blog

java - How to specify maven bin in eclipse maven plugin? -

single sign on - Logging into Plone site with credentials passed through HTTP -

php - Why does AJAX not process login form? -