Skip to content

Elasticsearch aggregation

ElasticsearchAggregationInput

Bases: Input

Performs an Elasticsearch Aggregation to provide a stream of events for processing.

Plugin name: elasticsearch

Example Configuration

.. code-block:: yaml

name: elasticsearch
conf:
  id_term: item_id
  connection_kwargs:
  index: ceda-index
  hosts: ['host1:9200','host2:9200']
  request_timeout: 60
Source code in stac_generator/plugins/inputs/elasticsearch_aggregation.py
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
class ElasticsearchAggregationInput(Input):
    """
    Performs an [Elasticsearch Aggregation](https://www.elastic.co/)
    to provide a stream of events for processing.

    **Plugin name:** ``elasticsearch``

    Example Configuration:
        .. code-block:: yaml

            name: elasticsearch
            conf:
              id_term: item_id
              connection_kwargs:
              index: ceda-index
              hosts: ['host1:9200','host2:9200']
              request_timeout: 60

    NOTE(review): the example above does not obviously match the fields of
    ``ElasticsearchConf`` (e.g. ``client_kwargs``, nested ``index``) --
    confirm against the ``Input`` base class and refresh it.
    """

    config_class = ElasticsearchConf

    def __init__(self, **kwargs):
        """Store the plugin keyword arguments needed by :meth:`run`.

        ``id_term`` and ``index`` are required keys (a missing one raises
        ``KeyError``); ``search_kwargs`` is optional and defaults to ``None``.
        """
        super().__init__(**kwargs)
        self.id_term = kwargs["id_term"]
        self.index = kwargs["index"]

        self.search_kwargs = kwargs.get("search_kwargs")

    def run(self):
        """Page through a composite aggregation and yield one dict per bucket.

        Each yielded dict holds ``"uri"`` plus one entry per configured extra
        term, stored under that term's ``output_key``. Paging continues until
        a response carries no ``after_key``. A summary line is printed once
        the aggregation has been exhausted.
        """
        start = datetime.now()
        total_generated = 0

        es_client = Elasticsearch(**self.conf.client_kwargs)

        # Composite sources are an ordered list of single-key objects.
        sources = [
            {
                "uri": {
                    # Must be the plain field name; wrapping it in braces
                    # would create a set, which is not JSON serialisable.
                    "terms": {"field": self.conf.uri_term},
                }
            },
            {
                "recipe_path": {
                    "terms": {"field": "recipe_path.keyword"},
                }
            },
        ]

        # Extra terms are additional composite sources. They are named after
        # the term key because the bucket key is read back by that name below.
        for extra_term in self.conf.extra_terms:
            sources.append(
                {
                    extra_term.key: {
                        "terms": {"field": f"{extra_term.key}"},
                    }
                }
            )

        body = {
            "query": self.conf.query,
            "aggs": {
                "bucket": {
                    "composite": {
                        "sources": sources,
                        "size": 100,
                    }
                }
            },
            # Only the aggregation buckets are needed, not the hits.
            "size": 0,
        }

        while True:
            result = es_client.search(
                index=self.index, body=body, request_timeout=self.conf.request_timeout
            )

            aggregation = result["aggregations"]["bucket"]

            for bucket in aggregation["buckets"]:
                output = {"uri": bucket["key"]["uri"]}

                for extra_term in self.conf.extra_terms:
                    output[extra_term.output_key] = bucket["key"][extra_term.key]

                yield output
                total_generated += 1

            # No "after_key" in the response means the last page was reached.
            if "after_key" not in aggregation:
                break

            # Resume the next page from where this one ended.
            body["aggs"]["bucket"]["composite"]["after"] = aggregation["after_key"]

        end = datetime.now()
        print(f"Processed {total_generated} elasticsearch records in {end-start}")

ElasticsearchConf

Bases: BaseModel

Elasticsearch config model.

Source code in stac_generator/plugins/inputs/elasticsearch_aggregation.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
class ElasticsearchConf(BaseModel):
    """Elasticsearch config model."""

    # Index settings, validated as a nested ElasticsearchIndex model.
    index: ElasticsearchIndex = Field(description="Elasticsearch index to post to.")

    # Field name used for the "uri" source of the composite aggregation.
    uri_term: str = Field(description="Term to use as uri.", default="uri.keyword")

    # Keyword arguments unpacked into the Elasticsearch client constructor.
    client_kwargs: dict = Field(description="Elasticsearch connection kwargs.", default={})

    # Additional terms; each entry exposes ``key`` and ``output_key``.
    extra_terms: list[KeyOutputKey] = Field(description="List of extra terms.", default=[])

    # Query placed under "query" in the search request body.
    query: dict = Field(description="Elasticsearch search query.", default={})

    # Passed as ``request_timeout`` to each search call.
    request_timeout: int = Field(description="Request timeout for search.", default=60)

ElasticsearchIndex

Bases: BaseModel

Elasticsearch index model.

Source code in stac_generator/plugins/inputs/elasticsearch_aggregation.py
23
24
25
26
27
28
29
30
31
32
class ElasticsearchIndex(BaseModel):
    """Elasticsearch index model."""

    # Required: there is no default, so validation fails without it.
    name: str = Field(description="Name of index.")

    # Either a string or a dict is accepted; empty dict when omitted.
    # NOTE(review): how a string mapping is interpreted is not visible here
    # (possibly a path to a mapping file) -- confirm against the consumer.
    mapping: str | dict = Field(description="Index initial mapping.", default={})