ElasticSearchで複数のフィールドによるグループ化（group by）を行う

Question

基本的に、次のMySQLクエリに相当するElasticSearchのクエリを作ろうとしています。

select gender, age_range, count(distinct profile_id) as count FROM TABLE group by age_range, gender

年齢と性別は簡単に取得できました。

{ "query": { "match_all": {} }, "facets": { "ages": { "terms": { "field": "age_range", "size": 20 } }, "gender_by_age": { "terms": { "fields": [ "age_range", "gender" ] } } }, "size": 0 }

このクエリは次の結果を返します：

{ "ages": { "_type": "terms", "missing": 0, "total": 193961, "other": 0, "terms": [ { "term": 0, "count": 162643 }, { "term": 3, "count": 10683 }, { "term": 4, "count": 8931 }, { "term": 5, "count": 4690 }, { "term": 6, "count": 3647 }, { "term": 2, "count": 3247 }, { "term": 1, "count": 120 } ] }, "total_gender": { "_type": "terms", "missing": 0, "total": 193961, "other": 0, "terms": [ { "term": 1, "count": 94799 }, { "term": 2, "count": 62645 }, { "term": 0, "count": 36517 } ] } }

しかし今、私はこのようなものが必要です：

[breakdown_gender] => Array ( [1] => Array ( [0] => 264 [1] => 1 [2] => 6 [3] => 67 [4] => 72 [5] => 40 [6] => 23 ) [2] => Array ( [0] => 153 [2] => 2 [3] => 21 [4] => 35 [5] => 22 [6] => 11 ) )

なお、0,1,2,3,4,5,6 は年齢範囲の「マッピング」であり、実際には数値そのものではなく、それぞれ何らかの意味を持つ点に注意してください。例えば、Gender [1]（「男性」）のうち、年齢範囲[0]（「18歳未満」）に分類されるもののカウントは264です。

moliware · Accepted Answer

2つのフィールドしかないため、単一のファセットで2つのクエリを実行するのが簡単な方法です。男性の場合：

{ "query" : { "term" : { "gender" : "Male" } }, "facets" : { "age_range" : { "terms" : { "field" : "age_range" } } } }

女性の場合：

{ "query" : { "term" : { "gender" : "Female" } }, "facets" : { "age_range" : { "terms" : { "field" : "age_range" } } } }

または、ファセットフィルターを使用して1つのクエリで実行できます（詳細についてはこのリンクを参照）

{ "query" : { "match_all": {} }, "facets" : { "age_range_male" : { "terms" : { "field" : "age_range" }, "facet_filter":{ "term": { "gender": "Male" } } }, "age_range_female" : { "terms" : { "field" : "age_range" }, "facet_filter":{ "term": { "gender": "Female" } } } } }

更新：

ファセットは廃止される予定のため、集約（aggregations）を使用した解決策を以下に示します。

{ "query": { "match_all": {} }, "aggs": { "male": { "filter": { "term": { "gender": "Male" } }, "aggs": { "age_range": { "terms": { "field": "age_range" } } } }, "female": { "filter": { "term": { "gender": "Female" } }, "aggs": { "age_range": { "terms": { "field": "age_range" } } } } } }

Joe · Answer

ElasticSearchのバージョン1.0以降では、新しい aggregations API のサブ集約（sub-aggregations）を使用して、複数のフィールドでグループ化できます。field1、field2、field3 の各フィールドでグループ化する場合は、次のようにします：

{ "aggs": { "agg1": { "terms": { "field": "field1" }, "aggs": { "agg2": { "terms": { "field": "field2" }, "aggs": { "agg3": { "terms": { "field": "field3" } } } } } } } }

もちろん、これは好きなだけ多くのフィールドに適用できます。

更新：
完全を期すために、上記のクエリの出力は次のようになります。また、集約クエリを生成し、結果を辞書のリストにフラット化するためのPythonコードも以下に示します。

{ "aggregations": { "agg1": { "buckets": [{ "doc_count": <count>, "key": <value of field1>, "agg2": { "buckets": [{ "doc_count": <count>, "key": <value of field2>, "agg3": { "buckets": [{ "doc_count": <count>, "key": <value of field3> }, { "doc_count": <count>, "key": <value of field3> }, ... ] }, { "doc_count": <count>, "key": <value of field2>, "agg3": { "buckets": [{ "doc_count": <count>, "key": <value of field3> }, { "doc_count": <count>, "key": <value of field3> }, ... ] }, ... ] }, { "doc_count": <count>, "key": <value of field1>, "agg2": { "buckets": [{ "doc_count": <count>, "key": <value of field2>, "agg3": { "buckets": [{ "doc_count": <count>, "key": <value of field3> }, { "doc_count": <count>, "key": <value of field3> }, ... ] }, { "doc_count": <count>, "key": <value of field2>, "agg3": { "buckets": [{ "doc_count": <count>, "key": <value of field3> }, { "doc_count": <count>, "key": <value of field3> }, ... ] }, ... ] }, ... ] } } }

次のPythonコードは、フィールドのリストを指定してグループ化を実行します。include_missing=True を指定すると、一部のフィールドが欠落している値の組み合わせも結果に含まれます（Elasticsearchのバージョン2.0を使用している場合は、this（元記事のリンク先）のおかげでこの指定は必要ありません）

def group_by(es, fields, include_missing):
    """Group documents by several fields using nested terms aggregations.

    One ``terms`` aggregation is built per entry of ``fields``, each nested
    inside the previous one. The request is sent with ``es.search`` and the
    nested response is flattened by ``get_docs_from_agg_result``.

    Args:
        es: Client exposing ``search(body=...)`` whose return value has an
            ``'aggregations'`` key (presumably an Elasticsearch client --
            confirm against the caller).
        fields: Non-empty sequence of field names, outermost grouping first.
        include_missing: When true, a ``missing`` aggregation named
            ``<field>_missing`` is added next to every ``terms`` aggregation
            so that combinations with absent fields are reported as well.

    Returns:
        A list of dicts, one per value combination with a non-zero leaf
        count. Each dict maps every field name to its bucket key (``None``
        for a missing-value bucket) and ``'doc_count'`` to the leaf count.

    Raises:
        IndexError: If ``fields`` is empty.
    """
    # NOTE(review): no 'size' is set on the terms aggregations, so the
    # server-side default bucket limit applies; high-cardinality fields may
    # be truncated -- confirm this is acceptable for the caller.
    agg_spec = _build_agg_spec(fields, include_missing)
    agg_result = es.search(body={'aggs': agg_spec})['aggregations']
    return get_docs_from_agg_result(agg_result, fields, include_missing)


def _build_agg_spec(fields, include_missing):
    """Return the nested ``aggs`` mapping for ``fields``, built inside-out."""
    if not fields:
        # IndexError (not ValueError) is kept on purpose: it is what
        # indexing ``fields[0]`` raised before this check existed.
        raise IndexError('fields must contain at least one field name')

    inner_spec = None
    for field in reversed(fields):
        level_spec = {field: {'terms': {'field': field}}}
        if include_missing:
            level_spec[field + '_missing'] = {'missing': {'field': field}}
        if inner_spec is not None:
            # Both the terms branch and the missing branch drill down into
            # the same deeper levels.
            for aggregation in level_spec.values():
                aggregation['aggs'] = inner_spec
        inner_spec = level_spec
    return inner_spec


def get_docs_from_agg_result(agg_result, fields, include_missing):
    """Flatten a nested aggregation result into a list of flat dicts.

    Args:
        agg_result: Mapping holding, for ``fields[0]``, an entry with a
            ``'buckets'`` list and, when ``include_missing`` is true, a
            ``<field>_missing`` entry. Deeper levels are read from inside
            each bucket in the same way.
        fields: Non-empty sequence of field names, outermost level first.
        include_missing: Whether the ``<field>_missing`` entries are read.

    Returns:
        A list of dicts as described in ``group_by``. Leaf buckets whose
        ``'doc_count'`` is zero are dropped.

    Raises:
        IndexError: If ``fields`` is empty.
    """
    current_field = fields[0]
    # Copy the bucket list: appending the missing bucket to the original
    # would mutate the caller's response and duplicate that bucket if the
    # same response were flattened again.
    buckets = list(agg_result[current_field]['buckets'])
    if include_missing:
        buckets.append(agg_result[current_field + '_missing'])

    if len(fields) == 1:
        return [
            {
                # A missing-value bucket has no 'key', so .get() yields None.
                current_field: bucket.get('key'),
                'doc_count': bucket['doc_count'],
            }
            for bucket in buckets
            if bucket['doc_count'] > 0
        ]

    result = []
    for bucket in buckets:
        value = bucket.get('key')
        records = get_docs_from_agg_result(
            bucket, fields[1:], include_missing)
        for record in records:
            record[current_field] = value
        result.extend(records)
    return result