https://github.com/geektime-geekbang/geektime-ELK


分词器

分词器概述

分词器包含三部分

  1. character filter: 分词之前的预处理,过滤掉HTML标签,特殊符号转换等。
  2. tokenizer: 分词
  3. token filter: 标准化,大小写,同义词,单复数转换。

分词器的类别

  • keyword: 不对输入做任何的处理,直接将输入当做一个 term 输出
  • standard: 对英文按 word 分词,对中文按照汉字分词

创建 & 使用分词器

下面为创建索引时指定类型的设置:

为索引创建了 person 类型,其中包含 user 字段

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
{
  "mappings": {
    "person": {
      "properties": {
        "user": {
          "type": "text",
          "analyzer": "ik_max_word",
          "search_analyzer": "ik_max_word"
        },
      }
    }
  }
}

  • search_analyzer 是搜索词的分词器
  • analyzer 是字段文本的分词器

分词器的使用示例

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
GET /_analyze
{
  "analyzer": "keyword",
  "text": "Mastering ElasticSearch, elasticsearch in action"
}
{
  "tokens" : [
    {
      "token" : "Mastering ElasticSearch, elasticsearch in action",
      "start_offset" : 0,
      "end_offset" : 48,
      "type" : "word",
      "position" : 0
    }
  ]
}

---

GET /_analyze
{
  "analyzer": "icu_analyzer",
  "text": "他说的确实在理"
}
{
  "tokens" : [
    {
      "token" : "他",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "<IDEOGRAPHIC>",
      "position" : 0
    },
    {
      "token" : "说的",
      "start_offset" : 1,
      "end_offset" : 3,
      "type" : "<IDEOGRAPHIC>",
      "position" : 1
    },
    {
      "token" : "确实",
      "start_offset" : 3,
      "end_offset" : 5,
      "type" : "<IDEOGRAPHIC>",
      "position" : 2
    },
    {
      "token" : "在",
      "start_offset" : 5,
      "end_offset" : 6,
      "type" : "<IDEOGRAPHIC>",
      "position" : 3
    },
    {
      "token" : "理",
      "start_offset" : 6,
      "end_offset" : 7,
      "type" : "<IDEOGRAPHIC>",
      "position" : 4
    }
  ]
}

GET /_analyze
{
  "analyzer": "standard",
  "text": "他说的确实在理"
}

{
  "tokens" : [
    {
      "token" : "他",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "<IDEOGRAPHIC>",
      "position" : 0
    },
    {
      "token" : "说",
      "start_offset" : 1,
      "end_offset" : 2,
      "type" : "<IDEOGRAPHIC>",
      "position" : 1
    },
    {
      "token" : "的",
      "start_offset" : 2,
      "end_offset" : 3,
      "type" : "<IDEOGRAPHIC>",
      "position" : 2
    },
    {
      "token" : "确",
      "start_offset" : 3,
      "end_offset" : 4,
      "type" : "<IDEOGRAPHIC>",
      "position" : 3
    },
    {
      "token" : "实",
      "start_offset" : 4,
      "end_offset" : 5,
      "type" : "<IDEOGRAPHIC>",
      "position" : 4
    },
    {
      "token" : "在",
      "start_offset" : 5,
      "end_offset" : 6,
      "type" : "<IDEOGRAPHIC>",
      "position" : 5
    },
    {
      "token" : "理",
      "start_offset" : 6,
      "end_offset" : 7,
      "type" : "<IDEOGRAPHIC>",
      "position" : 6
    }
  ]
}

聚合分析

  • Bucket Aggregation

将一些列满足特定条件的文档聚合成一个桶

  • Metric Aggreation

一些数学运算,可以对文档的字段进行统计分析

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
# 根据电影的年份进行分组
GET movies/_search
{
  "size": 0,
  "aggs": {
    "movie_year": {
      "terms": {
        "field": "year"
      }
    }
  }
}

# 将电影根据年份分组后,再来求每组内的最大年份和最小年份(由于text 类型不能比较,只好求个最大年份)
GET movies/_search
{
  "size": 0,
  "aggs": {
    "movie_year": {
      "terms": {
        "field": "year"
      },
      "aggs": {
        "max_year": {
          "max": {
            "field": "year"
          }
        },
        "min_year": {
          "min": {
            "field": "year"
          }
        }
      }
    }
  }
}

# 根据目的地将航班进行分组
GET kibana_sample_data_flights/_search
{
  "size": 0,
  "aggs": {
    "flight_dest": {
      "terms": {
        "field": "DestCountry",
        "size": 10
      }
    }
  }
}

# 根据目的地将航班进行分组,且在组内输出票价
GET kibana_sample_data_flights/_search
{
  "size": 0,
  "aggs": {
    "flight_dest": {
      "terms": {
        "field": "DestCountry",
        "size": 10
      },
      "aggs": {
        "avg_price": {
          "avg": {
            "field": "AvgTicketPrice"
          }
        },
        "max_price": {
          "max": {
            "field": "AvgTicketPrice"
          }
        },
        "min_price": {
          "min": {
            "field": "AvgTicketPrice"
          }
        }
      }
    }
  }
}

# 根据目的地将航班进行分桶,再根据目的地天气进行分桶,输出每个组内平均票价的统计值
GET kibana_sample_data_flights/_search
{
  "size": 0,
  "aggs": {
    "flight_dest": {
      "terms":{
        "field": "DestCountry"
      },
      "aggs": {
        # stats 是 Metric 聚合
        "stats_price": {
          "stats": {
            "field": "AvgTicketPrice"
          }
        },
        "weather": {
          "terms": {
            "field": "DestWeather",
            "size": 10
          }
        }
      }
    }
  }
}

Term 查询

  • Term Query / Range Query / Exists Query / Prefix Query / Wildcard Query

Term 是表达语意的最小单位。搜索和利用统计语言模型进行自然语言处理都需要处理 Term。 Term 查询不做分词,会将输入作为一个整体,在倒排索引中查找准确的词项,并且利用相关度算分公式为每个包含该词的文档进行相关度算分。

ES 默认为每个 text 字段都新建一个 keyword 字段,可以通过 keyword 进行精确查询。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
POST /products/_search
{
    "query": {
        "term": {
            "productID.keyword": {
                "value": "XHDK-A-1293-#fJ3"
            }
        }
    }
}
  • 通过 constant_score 可以跳过算分步骤

全文查询

  • Match Query / Match Phrase Query / Query string Query

  • 搜索时会分词,查询字符串先找到一个合适的分词器,生成一个供查询的词项列表, 然后针对每个词项在倒排索引中进行查询。

  • Match 查询的过程

结构化搜索

日期,布尔类型和数字都是结构化的,结构化的数据可以使用 Term 查询。搜索结果只有"是"和"否"两个值。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
POST /products/_bulk
{"index": {"_id": 1}}
{"price": 10, "avaliable": true, "date": "2019-01-01", "productID": "XHDK-A-1293-#f13"}
{"index": {"_id": 2}}
{"price": 20, "avaliable": true, "date": "2020-01-01", "productID": "KDKE-A-1293-#f13"}
{"index": {"_id": 3}}
{"price": 30, "avaliable": true, "productID": "JODL-A-9947-#f13"}
{"index": {"_id": 4}}
{"price": 30, "avaliable": false, "productID": "QQPX-R-3956-#f13"}

GET products/_mapping

# 查询某个布尔值字段
POST products/_search
{
  "profile": "true",
  "explain": true,
  "query": {
    "term": { "avaliable": true }
  }
}

# 跳过算分步骤的布尔查询
POST products/_search
{
  "profile": "true",
  "explain": true,
  "query": {
    "constant_score": {
      "filter": {
        "term": { "avaliable": true }
      }
    }
  }
}

# 数字 Range 查询
GET products/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "range": {
          "price": { "gte": 20, "lte": 30 }
        }
      }
    }
  }
}

# 日期 range 查询
# now-1y 表示一年前的今天
GET products/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "range": {
          "date": { "gte": "now-1y" }
        }
      }
    }
  }
}

# 通过 exists 来处理空值
POST products/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "exists": { "field": "date" }
      }
    }
  }
}

# 处理多值字段,Term 查询是包含,而不是等于
POST /movies/_bulk
{"index": {"_id": 1}}
{"title": "Father of the bridge II", "year": 1995, "genre": "Comedy"}
{"index": {"_id": 2}}
{"title": "Dave", "year": 1993, "genre": ["Comedy", "Romance"]}

GET /movies/_mapping

POST /movies/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "term": { "genre.keyword": "Comedy" }
      }
    }
  }
}

搜索的相关性算分

相关性描述的是搜索语句和文档的匹配程度。ES 5之前,默认的相关性算分采用 TF-IDF,现在采用 BM 25。

  • 词频(Term Frequency): 检索词在一篇文档中出现的频率,检索词出现的次数 / 文档总字数
  • DF: 检索词在所有文档中出现的频率
  • 逆文档频率(Inverse Document Frequency): log(全部文档数/检索词出现过的文档总数)
  • TF-IDF 的本质是将 TF 求和变成了加权求和。TF(Term1) * IDF(Term1) + TF(Term2) * IDF(Term2) ...

Lucene 中的 TF-IDF 评分公式:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
PUT testscore/_bulk
{"index": {"_id": 1}}
{"content": "we use ElasticSearch to power the search"}
{"index": {"_id": 2}}
{"content": "we like ElasticSearch"}
{"index": {"_id": 3}}
{"content": "The scoring of documents is calculated by the scoring formula"}
{"index": {"_id": 4}}
{"content": "you known, for search"}

# 通过打开 explain,可以看到相关性分的计算过程
POST testscore/_search
{
  "explain": true,
  "query": {
    "match": { "content": "elasticsearch" }
  }
}

# 通过 Boosting 控制相关度
# boost > 1 时,打分的相关度相对性提升
# 0 < boost < 1 时,打分的权重相对性降低
# boost < 0 时,贡献负分
# 这里的 negative_boost 就是匹配 "like" term 的文档的相关性分会在最后乘 0.2
POST testscore/_search
{
  "explain": true,
  "query": {
    "boosting": {
      "positive": {
        "term": { "content": "elasticsearch" }
      },
      "negative": {
        "term": { "content": "like" }
      },
      "negative_boost": 0.2
    }
  }
}

多字符串多字段查询

在 ES 中,有 Query 和 Filter 两种不同的 Context:

  • Query Context: 相关性算分
  • Filter Context: 不需要算法(Yes or No),可以利用 Cache,获得更好的性能

布尔查询

布尔查询是一个或多个查询子句的组合。

相关性并不是全文本检索的独有特性,也适用于 Yes | No 的子句,匹配的子句越多,相关性越高。如果多条查询语句被合并成一条复合查询语句,例如 bool 查询,则每个查询子句计算出的评分会被合并到总的相关性评分中。

子句 说明
must 必须匹配,贡献算分
should 选择性匹配,贡献算分
must_not Filter Context,查询子句,必须不能匹配
filter Filter Context,必须匹配,但不贡献算分
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
POST /products/_bulk
{"index": {"_id": 1}}
{"price": 10, "avaliable": true, "date": "2019-01-01", "productID": "XHDK-A-1293-#f13"}
{"index": {"_id": 2}}
{"price": 20, "avaliable": true, "date": "2020-01-01", "productID": "KDKE-A-1293-#f13"}
{"index": {"_id": 3}}
{"price": 30, "avaliable": true, "productID": "JODL-A-9947-#f13"}
{"index": {"_id": 4}}
{"price": 30, "avaliable": false, "productID": "QQPX-R-3956-#f13"}


# 布尔查询示例,多个条件之间是 and 的关系。
# 每个子句算的分会汇总起来,变成布尔语句的分
POST /products/_search
{
  "query": {
    "bool": {
      "must": {
        "term": {"price": "30"}
      },
      "filter": {
        "term": {"avaliable": true}
      },
      "must_not": [
        {"range": { "price": {"lte": 10} }}
      ],
      "should": [
        {"term": {"productID.keyword": "JODL-A-9947-#f13"}},
        {"term": {"productID.keyword": "XHDK-A-1293-#f13"}}
      ]
    }
  }
}

# filter 和 must_not 都是 Filter Context,不会进行相关性算分,可以看到算出的分都是0
POST /products/_search
{
  "query": {
    "bool": {
      "filter": {
        "term": {"avaliable": true}
      },
      "must_not": [
        {"range": {"price": {"lte": 10}}}
      ]
    }
  }
}
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
POST /blogs/_bulk
{"index": {"_id": 1}}
{"title": "Apple iPad", "content": "Apple iPad,Apple iPad"}
{"index": {"_id": 2}}
{"title": "Apple iPad,Apple iPad", "content": "Apple iPad"}

# 通过 boost 控制字段的分数,调整字段之间的优先级
# 当 title 的 boost 较大时,title 长的文档(2)就会排在前面,反之同理
POST /blogs/_search
{
  "query": {
    "bool": {
      "should": [
        {
          "match": {
            "title":{
              "query": "apple,ipad",
              "boost": 3
            }}
        },
        {
          "match": {
            "content":{
              "query": "apple,ipad",
              "boost": 2
            }}
        }
      ]
    }
  }
}
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
POST /news/_bulk
{"index": {"_id": 1}}
{"content": "Apple Mac"}
{"index": {"_id": 2}}
{"content": "Apple iPad"}
{"index": {"_id": 3}}
{"content": "Apple employee like Apple Pie and Apple Juice"}


# 查询结果中包含了食物信息(3),我们并不想要这一条
POST /news/_search
{
  "query": {
    "bool": {
      "must": {
        "match": {"content": "apple"}
      }
    }
  }
}

# 通过布尔查询将 3 去掉
# 布尔查询多个子句之间是 and 的关系
POST /news/_search
{
  "query": {
    "bool": {
      "must": {
        "match": {"content": "apple"}
      },
      "must_not": {
        "match": {"content": "pie"}
      }
    }
  }
}

# 通过 boosting 降低3的权重,让它排到后面
POST /news/_search
{
  "query": {
    "boosting": {
      "positive": {
        "match": {"content": "apple"}
      },
      "negative": {
        "match": {"content": "pie"}
      },
      "negative_boost": 0.5
    }
  }
}

单字符串多字段查询

场景归纳

最佳字段 (Best Fields)

当字段之间相互竞争,又相互管理。例如 title 和 body 这样的字段

多数字段 (Most fields)

处理英文内容是:一种常见的手段是,在主字段( English Analyzer)上抽取词干,加入同义词,以匹配更多的文档。相同的文本,加入子字段(Standard Analyzer),以提供更加精确的匹配。其他字段作为匹配文档相关度的信号。匹配字段越多则越好。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
DELETE /titles

# 这里有两个字段 title, 和 title.std
PUT /titles
{
  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "analyzer": "english",
        "fields": {"std": {"type": "text", "analyzer": "standard"}}
      }
    }
  }
}

POST titles/_bulk
{"index":{"_id": 1}}
{"title": "My dog barks"}
{"index":{"_id": 2}}
{"title": "I see a lot of barking dogs on the road"}

# 这里会计算两个字段的分,这样更加符合的文档2就会在前面了
GET titles/_search
{
  "query": {
    "multi_match": {
      "query": "barking dogs",
      "type": "most_fields",
      "fields": ["title", "title.std"]
    }
  }
}

混合字段 (Cross Field)

对于某些实体,例如人名,地址,图书信息。需要在多个字段中确定信息,单个字段只能作为整体的一部分。希望在任何列出的字段中找到尽可能多的词。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
# 跨字段搜索

PUT address/_doc/1
{
  "street": "5 Poland Street",
  "city": "London",
  "country": "United Kingdom",
  "postcode": "W1V 3DG"
}

# 用一个关键字查询多个字段
POST address/_search
{
  "query": {
    "multi_match": {
      "query": "Poland Street W1V",
      "type": "cross_fields",
      "operator": "and",
      "fields": ["street", "city", "country", "postcode"]
    }
  }
}

Disjunction Max Query

将任何与任一查询匹配的文档作为结果返回。采用字段上最匹配的评分作为最终评分返回。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
PUT /blogs/_bulk
{"index": {"_id": 1}}
{"title": "Quick brown rabbits", "body": "Brown rabbitq are commonly seen."}
{"index": {"_id": 2}}
{"title": "Keeping pets healthy", "body": "My quick brown fox eats rabbits on a regular basis."}

# 查询算分过程
# 查询 should 语句中的两个查询
# 加和两个查询的评分
# 乘以匹配语句的总数
# 除以所有语句的总数

# 这里的文档1的两个子句的算分平均分更高,所以1在前面
POST /blogs/_search
{
  "explain": true,
  "query": {
    "bool": {
      "should": [
        {"match": {"title": "Brown fox"}},
        {"match": {"body": "Brown fox"}}
      ]
    }
  }
}

# 这里使用了 dis_max,返回的最终分数算分是根据最高的那个子句的分数得出的
POST /blogs/_search
{
  "query": {
    "dis_max": {
      "queries": [
        {"match": {"title": "Brown fox"}},
        {"match": {"body": "Brown fox"}}
      ]
    }
  }
}

# 获得评分最高的语句后,tie_breaker 将与其他语句的评分相乘,得到最终评分
POST /blogs/_search
{
  "query": {
    "dis_max": {
      "queries": [
        {"match": {"title": "Quick pets"}},
        {"match": {"body": "Quick pets"}}
      ],
      "tie_breaker": 0.7
    }
  }
}

多语言及中文分词与检索

Docker ES 集群的配置

  • 使用拼音分词器进行分词
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
PUT /artists/
{
  "settings": {
    "analysis": {
      "analyzer": {
        "user_name_analyzer": {
          "tokenizer": "whitespace",
          "filter": "pinyin_first_letter_and_full_pinyin_filter"
        }
      },
      "filter": {
        "pinyin_first_letter_and_full_pinyin_filter": {
          "type": "pinyin",
          "keep_first_letter": true,
          "keep_full_pinyin": false,
          "keep_none_chinese": true,
          "keep_original": false,
          "limit_first_letter_length": 16,
          "lowercase": true,
          "trim_whitespace": true,
          "keep_none_chinese_in_first_letter": true
        }
      }
    }
  }
}

GET /artists/_analyze
{
  "text": ["刘德华 张学友 郭富城 黎明 四大天王"],
  "analyzer": "user_name_analyzer"
}
{
  "tokens" : [
    {
      "token" : "ldh",
      "start_offset" : 0,
      "end_offset" : 3,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "zxy",
      "start_offset" : 4,
      "end_offset" : 7,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "gfc",
      "start_offset" : 8,
      "end_offset" : 11,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "lm",
      "start_offset" : 12,
      "end_offset" : 14,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "sdtw",
      "start_offset" : 15,
      "end_offset" : 19,
      "type" : "word",
      "position" : 4
    }
  ]
}

使用 Search-Template 和 Index-Alias 查询

Search Template 可以定义一个查询,将 查询优化使用查询 这两个工作分开, SearchTemplate 的语法参考官方文档

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# 定义搜索模板,通过 POST 也可以更新这个模板
POST _scripts/tmdb
{
  "script": {
    "lang": "mustache",
    "source": {
      "_source": [
        "title", "overview"
      ]
    },
    "size": 20,
    "query": {
      "multi_match": {
        "query": "{{q}}",
        "fields": ["title", "overview"]
      }
    }
  }
}

# 使用搜索模板
POST tmdb/_search/template
{
  "id": "tmdb",
  "params": {
    "q": "basketball with cartoon aliens"
  }
}

Index Alias,为索引创建一个别名,可以实现0停机运维。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# 为索引创建一个别名,可以将 tmdb-latest 当做 tmdb 来使用
POST _aliases
{
  "actions": [
    {
      "add": {
        "index": "tmdb",
        "alias": "tmdb-latest"
      }
    }
  ]
}

# 为索引创建别名,同时附带过滤条件,tmdb-latest-highrate 这个索引和 tmdb 索引的结构一样,但是只有 2191 个结果。(tmdb 有 3051 个结果)
POST _aliases
{
  "actions": [
    {
      "add": {
        "index": "tmdb",
        "alias": "tmdb-latest-highrate",
        "filter": {
          "range": {
            "vote_average": {
              "gte": 6
            }
          }
        }
      }
    }
  ]
}

对象及嵌套对象(Nested)

包含嵌套对象的文档

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# 创建博客文档

PUT /blog
{
  "mappings": {
    "properties": {
      "content": {
        "type": "text"
      },
      "time": {
        "type": "date"
      },
      "user": {
        "properties": {
          "city": {
            "type": "text"
          },
          "userid": {
            "type": "long"
          },
          "username": {
            "type": "keyword"
          }
        }
      }
    }
  }
}

# 向博客文档中插入数据
PUT blog/_doc/1
{
  "content": "I Like ElasticSearch",
  "time": "2020-05-10T21:45:12",
  "user": {
    "userid": 1,
    "username": "Jack",
    "city": "Shanghai"
  }
}

# 查询 Blog 信息
POST blog/_search
{
  "query": {
    "bool": {
      "must": [
        {"match": {"content": "ElasticSearch"}},
        {"match": {"user.username": "Jack"}}
      ]
    }
  }
}

包含对象数组的文档

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# 创建电影的文档

PUT my_movies
{
  "mappings": {
    "properties": {
      "actors": {
        "properties": {
          "first_name": {
            "type": "keyword"
          },
          "last_name": {
            "type": "keyword"
          }
        }
      },
      "title": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      }
    }
  }
}

# 写入一条电影信息
POST my_movies/_doc/1
{
  "title": "Speed",
  "actors": [
    {"first_name": "Keanu", "last_name": "Reeves"},
    {"first_name": "Dennis", "last_name": "Hopper"}
  ]
}

# 由于 ES 会将数组的每个字段都扁平存储,所以以下查询能查到结果
# "title": "speed"
# "actor.first_name": ["Keanu", "Dennis"],
# "actor.last_name": ["Reeves", "Hoppers"]
GET my_movies/_search
{
  "query": {
    "bool": {
      "must": [
        {"match": {"actors.first_name": "Keanu"}},
        {"match": {"actors.last_name": "Hopper"}}
      ]
    }
  }
}

Nested 数据类型

Nested 数据类型允许对象数组中的对象被独立索引,在 ES 内部,Nested 文档会被保存在两个 Lucene 文档中,在查询时做 Join 处理。

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
# 创建电影的文档

PUT my_movies
{
  "mappings": {
    "properties": {
      "actors": {
        # 注意这里声明了类型为 nested
        "type": "nested",
        "properties": {
          "first_name": {
            "type": "keyword"
          },
          "last_name": {
            "type": "keyword"
          }
        }
      },
      "title": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      }
    }
  }
}

# 写入一条电影信息
POST my_movies/_doc/1
{
  "title": "Speed",
  "actors": [
    {"first_name": "Keanu", "last_name": "Reeves"},
    {"first_name": "Dennis", "last_name": "Hopper"}
  ]
}

# 通过一个嵌套对象字段进行查询
GET my_movies/_search
{
  "query": {
    # 声明查询的字段是嵌套字段
    "nested": {
      # 指出嵌套字段的位置
      "path": "actors",
      # 在嵌套字段内的查询方式
      "query": {
        "bool": {
          "should": [
            {"match": {"actors.first_name": "Keanu"}},
            {"match": {"actors.last_name": "Hopper"}}
          ]
        }
      }
      # 另一种简单的查询
      # "query": {
      #   "match": {"actors.first_name": "Keanu"}
      # }
    }
  }
}

# 使用 Nested 查询
GET my_movies/_search
{
  "query":{
    "nested": {
      "path": "actors",
      "query": {
        "bool": {
          "must": [
            {"match": {"actors.first_name": "Keanu"}},
            {"match": {"actors.last_name": "Reeves"}}
          ]
        }
      }
    }
  }
}

# 对嵌套对象的字段进行聚合
GET my_movies/_search
{
  "size": 0,
  "aggs": {
    "actors": {
      "nested": {
        "path": "actors"
      },
      "aggs": {
        "acotr_name": {
          "terms": {
            "field": "actors.first_name",
            "size": 10
          }
        }
      }
    }
  }
}