Skip to content

Xml

XMLExtract

Bases: ExtractionMethod

Processes XML documents to extract metadata

Method name: xml

Example configuration

.. code-block:: yaml

- method: xml
  inputs:
    properties:
      - name: start_datetime
        key: './/gml:beginPosition'
        attribute: start
    namespaces:
      gml: 'http://www.opengis.net/gml/3.2'

noqa: W605

Source code in extraction_methods/plugins/xml.py
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
class XMLExtract(ExtractionMethod):
    """
    Processes XML documents to extract metadata

    **Method name:** ``xml``

    Each configured property is looked up with ``findall`` using the
    configured namespace map. Either the element text or, when
    ``attribute`` is set, that attribute's value is collected.

    Example configuration:
        .. code-block:: yaml

            - method: xml
              inputs:
                properties:
                  - name: start_datetime
                    key: './/gml:beginPosition'
                    attribute: start
                namespaces:
                  gml: 'http://www.opengis.net/gml/3.2'

    # noqa: W605
    """

    input_class = XMLInput

    def run(self, body: dict[str, Any]) -> dict[str, Any]:
        """
        Extract the configured properties from the XML input into ``body``.

        ``self.input.input_term`` is parsed as a file when it names an
        existing file, otherwise it is parsed as an inline XML string.

        :param body: Dictionary to update in place with the extracted values.
        :return: The same ``body``. It is returned unchanged when the input
            cannot be parsed. For each property with at least one non-empty
            match, ``body[output_key]`` holds a single string when there is
            exactly one distinct value, otherwise a list of distinct values
            in document order.
        """
        source = self.input.input_term

        # Best effort: an unreadable or malformed document leaves body as-is.
        try:
            if os.path.isfile(source):
                xml_file = etree.parse(source)
            else:
                # NOTE(review): encoding with "ignore" silently drops any
                # non-ASCII characters from inline documents - confirm this
                # is intended.
                xml_file = etree.XML(source.encode("ascii", "ignore"))
        except (etree.ParseError, FileNotFoundError, TypeError):
            return body

        output: dict[str, list[str]] = defaultdict(list)

        for prop in self.input.properties:
            # Properties sharing an output key accumulate into the same list.
            collected = output[prop.output_key]

            for element in xml_file.findall(prop.key, self.input.namespaces):
                if prop.attribute:
                    raw = element.get(prop.attribute, "")
                else:
                    raw = element.text

                # Normalise before the emptiness and duplicate checks so that
                # whitespace-only text is skipped and values differing only
                # in surrounding whitespace are stored once.
                text = raw.strip() if raw else ""

                if text and text not in collected:
                    collected.append(text)

            if collected:
                body[prop.output_key] = (
                    collected[0] if len(collected) == 1 else collected
                )

        return body

XMLInput

Bases: Input

Model for XML Input.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_term | str | Term for method to run on. | '$uri' |
| properties | list[XMLProperty] | List of properties to retrieve from the document. | required |
| namespaces | dict[str, str] | Map of namespaces. | required |
Source code in extraction_methods/plugins/xml.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
class XMLInput(Input):
    """
    Input model for the ``xml`` extraction method.

    Declares the term to read the XML from, the properties to look up and
    the namespace map used when resolving prefixed keys.
    """

    # NOTE: ``template`` and ``filter_expr`` string fields were sketched here
    # previously but are currently disabled.
    input_term: str = Field("$uri", description="Term for method to run on.")
    properties: list[XMLProperty] = Field(
        ...,
        description="List of properties to retrieve from the document.",
    )
    namespaces: dict[str, str] = Field(..., description="Map of namespaces.")

XMLProperty

Bases: KeyOutputKey

Model for XML property.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| attribute | str | Attribute of the XML property. | '' |
Source code in extraction_methods/plugins/xml.py
27
28
29
30
31
32
33
34
35
36
class XMLProperty(KeyOutputKey):
    """
    A single property to look up in an XML document.

    Adds an optional ``attribute`` to the fields inherited from
    ``KeyOutputKey``; it defaults to the empty string.
    """

    attribute: str = Field("", description="Attribute of the XML property.")