diff --git a/.gitignore b/.gitignore
index 23ccece1a..7290b93c3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,4 @@ vendor/
generated/
.DS_Store
*.swp
+*.pyc
diff --git a/custom_documentation/src/documentation_overrides.yaml b/custom_documentation/src/documentation_overrides.yaml
new file mode 100644
index 000000000..cd444099d
--- /dev/null
+++ b/custom_documentation/src/documentation_overrides.yaml
@@ -0,0 +1,211 @@
+- name: Endpoint.policy.applied.artifacts.global.channel
+ default:
+ description: The channel of the artifact.
+ example: stable
+ type: keyword
+ os:
+ linux:
+ description: The channel of the linux artifact.
+ windows:
+ description: The channel of the windows artifact.
+ macos:
+ description: The channel of the macos artifact.
+ # event:
+ # linux_malicious_behavior_alert:
+ # description: The channel of the artifact for linux malicious behavior alert.
+ # example: stable
+
+- name: agent.type
+ default:
+ example: endpoint
+
+- name: Endpoint.policy.applied.artifacts.global.identifiers.name
+ default:
+ example: global-configuration-v1
+
+- name: Endpoint.policy.applied.artifacts.global.snapshot
+ default:
+ example: "latest"
+
+- name: Endpoint.policy.applied.artifacts.user.identifiers.name
+ os:
+ linux:
+ example: endpoint-trustlist-linux-v1
+ windows:
+ example: endpoint-trustlist-windows-v1
+ macos:
+ example: endpoint-trustlist-macos-v1
+
+- name: Endpoint.policy.applied.artifacts.user.version
+ default:
+ example: "1.0.0"
+
+- name: agent.build.original
+ default:
+ example: "version: 9.1.0, compiled: Mon Jun 2 15:00:00 2025, branch: main, commit: 3fd26249705c5a467960870702589686ef04da43"
+
+- name: data_stream.dataset
+ default:
+ example: endpoint.alerts
+
+- name: event.action
+ default:
+ description: |-
+ Possible values for Endpoint include:
+ - elastic_endpoint_telemetry
+ - endpoint_metadata
+ - endpoint_policy_response
+ - endpoint_metrics
+ - endpoint_heartbeat
+ - malicious_file
+ - endpoint_unquarantine
+ - lookup_result
+ - lookup_requested
+ - creation
+ - deletion
+ - demand
+ - clone
+ - link
+ - exchange
+ - execution
+ - modification
+ - open
+ - query
+ - save
+ - overwrite
+ - rename
+ - extended_attributes_delete
+ - unknown
+ - load
+ - connection_accepted
+ - connection_attempted
+ - disconnect_received
+ - http_request
+ - udp_datagram_outgoing
+ - udp_datagram_incoming
+ - icmp_outgoing
+ - icmp_incoming
+ - already_running
+ - fork
+ - end
+ - exec
+ - gid_change
+ - start
+ - session_id_change
+ - uid_change
+ - remote_thread
+ - process_open
+ - text_output
+ - memfd_create
+ - shmget
+ - ptrace
+ - load_module
+ - log_on
+ - log_off
+ - workstation_locked
+ - workstation_unlocked
+ - ssh_log_on
+ - rdp_log_on
+ - service-installed
+ - scheduled-task-created
+ - scheduled-task-updated
+ - added-user-account
+ - group-membership-enumerated
+ - user-member-enumerated
+ - token-right-adjusted
+ - network-share-object-added
+ - network-share-object-access-checked
+ - vault-credentials-were-read
+ - gatekeeper_override
+ - mbr-overwrite
+ - files-encrypted
+ - canary-activity
+ - rule_detection
+ - rule_prevention
+ - api
+ - launch_daemon
+ - mount
+ - unmount
+
+- name: event.category
+ default:
+ type: array of keyword
+ example: '["malware", "intrusion_detection"]'
+ description: |-
+ Possible values for Endpoint include:
+ - authentication
+ - configuration
+ - driver
+ - file
+ - host
+ - iam
+ - intrusion_detection
+ - library
+ - malware
+ - network
+ - process
+ - registry
+ - session
+ - rule
+ - credential_hardening
+ - api
+ - volume_device
+ - security
+
+- name: event.dataset
+ default:
+ example: endpoint.alerts
+
+- name: event.module
+ default:
+ example: endpoint
+ description: |-
+ The module for Endpoint is always `endpoint`.
+
+- name: event.risk_score
+ default:
+ example: "99"
+ description: Endpoint risk score uses a scale of 0 to 100, where 100 is the highest risk.
+
+- name: event.severity
+ default:
+ example: "73"
+ description: Endpoint severity uses a scale of 0 to 100, where 100 is the highest severity.
+
+- name: event.type
+ default:
+ example: '["info", "allowed"]'
+ type: array of keyword
+ description: |-
+ Possible values for Endpoint include:
+ - allowed
+ - change
+ - creation
+ - deletion
+ - denied
+ - end
+ - info
+ - protocol
+ - start
+ - access
+ - admin
+ - user
+ - group
+
+- name: event.kind
+ default:
+ description: |-
+ Possible values for Endpoint include:
+ - alert
+ - event
+ - metric
+ - state
+
+- name: event.outcome
+ default:
+ description: |-
+ Possible values for Endpoint include:
+ - success
+ - failure
+ - unknown
diff --git a/scripts/generate-docs/pydocgen/Readme.md b/scripts/generate-docs/pydocgen/Readme.md
new file mode 100644
index 000000000..2f5461b68
--- /dev/null
+++ b/scripts/generate-docs/pydocgen/Readme.md
@@ -0,0 +1,54 @@
+# Custom Documentation Generator
+
+## Description
+
+This module generates documentation for the custom Endpoint fields defined in [custom_documentation](../../../custom_documentation/).
+
+### Background
+
+The fields defined in [custom_documentation](../../../custom_documentation/) do not have descriptions. They simply enumerate the possible fields
+of an event, including the custom fields that Endpoint uses but that are not mapped.
+
+The fields defined in [package](../../../package/) are the fields that are mapped into Kibana. These fields have descriptions and documentation.
+
+
+### Implementation
+
+This Python module generates markdown for all of the fields in [custom_documentation](../../../custom_documentation/) by taking the following steps:
+
+1. Parses all of the mapped fields defined in [package](../../../package/), collecting descriptions, examples, and other metadata
+
+2. Parses any override fields defined in [documentation_overrides.yaml](../../../custom_documentation/src/documentation_overrides.yaml)
+ - Overrides can be set for any field, at the event level, at the OS level, or as a default override that applies to all
+ instances of that field.
+ - See [documentation_overrides.yaml](../../../custom_documentation/src/documentation_overrides.yaml) for the format (a short excerpt is shown after this list)
+ - If overrides are updated, the documentation must be regenerated
+
+3. Puts all of that data into a SQLite database
+
+4. Parses all of the endpoint fields defined in [custom_documentation](../../../custom_documentation/)
+
+5. Iterates over the custom_documentation data, filling in descriptions and examples pulled from the database created in step 3.
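+
+For illustration, an override entry looks like this (excerpted from [documentation_overrides.yaml](../../../custom_documentation/src/documentation_overrides.yaml); a `default` block applies to every instance of the field, while `os` and `event` blocks apply only to matching documents):
+
+```yaml
+- name: Endpoint.policy.applied.artifacts.global.channel
+  default:
+    description: The channel of the artifact.
+    example: stable
+    type: keyword
+  os:
+    linux:
+      description: The channel of the linux artifact.
+```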
+
+### Example Usage
+`python -m pydocgen --output-dir /path/to/output`
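+
+To force a rebuild of the cached database and also write a CSV of fields with missing documentation, something like:
+
+`python -m pydocgen --no-cache --csv missing_docs.csv --output-dir /path/to/output`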
+
+#### Help statement
+```
+usage: __main__.py [-h] [--database DATABASE] [--no-cache] [--output-dir OUTPUT_DIR] [-v] [-l {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [--overrides OVERRIDES] [--csv CSV]
+
+Create markdown documentation for the fields defined in custom_documentation
+
+options:
+ -h, --help show this help message and exit
+ --database DATABASE path to the database
+ --no-cache do not use cached database if it exists, always regenerate the database
+ --output-dir OUTPUT_DIR
+ output directory for markdown documentation
+ -v, --verbose Force maximum verbosity (DEBUG level + detailed output)
+ -l {DEBUG,INFO,WARNING,ERROR,CRITICAL}, --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
+ Set logging verbosity level
+ --overrides OVERRIDES
+ path to the overrides yaml file
+ --csv CSV Path to CSV file for missing documentation fields (optional)
+
+Example usage: python -m pydocgen --output-dir /path/to/output
+```
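+
+#### Output format
+
+Each field entry in the generated markdown looks roughly like the following (an illustrative sketch based on `write_markdown` in `markdown.py`; the field name and values here are made up):
+
+```markdown
+### `process.hash.sha256`
+
+**ECS Description**
+
+> The SHA256 hash of the process executable.
+
+**Example**
+
+> 8fce5f9b011b8ac42d0f19f742a1c3d17ec25035232fd5f19ca10b82c05cbb45
+
+**Type**
+
+> keyword
+
+<hr>
+```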
diff --git a/scripts/generate-docs/pydocgen/__init__.py b/scripts/generate-docs/pydocgen/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/scripts/generate-docs/pydocgen/__main__.py b/scripts/generate-docs/pydocgen/__main__.py
new file mode 100644
index 000000000..74176ede5
--- /dev/null
+++ b/scripts/generate-docs/pydocgen/__main__.py
@@ -0,0 +1,110 @@
+import argparse
+import logging
+import pathlib
+import sys
+import tempfile
+import traceback
+
+from typing import Literal
+
+from .markdown import generate_custom_documentation_markdown
+
+
+def configure_logging(
+ log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
+ verbose: bool
+) -> None:
+ """Configures the logging system with specified level and verbosity.
+
+ Args:
+ log_level: String representation of logging level (DEBUG, INFO, etc.)
+ verbose: Boolean flag to force maximum verbosity
+ """
+ level = getattr(logging, log_level)
+
+ # If verbose is specified, override to DEBUG level
+ if verbose:
+ level = logging.DEBUG
+
+ # Basic config; basicConfig installs a console (stream) handler by default
+ logging.basicConfig(
+ level=level,
+ format="%(asctime)s - %(levelname)-8s %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S",
+ )
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Create markdown documentation for the fields defined in custom_documentation",
+ epilog="Example usage: python -m pydocgen --output-dir /path/to/output",
+ )
+
+ parser.add_argument(
+ "--database",
+ default=pathlib.Path(tempfile.gettempdir()) / "generate-docs.sqlite",
+ type=pathlib.Path,
+ help="path to the database",
+ )
+
+ parser.add_argument(
+ "--no-cache",
+ action="store_true",
+ help="do not use cached database if it exists, always regenerate the database",
+ )
+
+ parser.add_argument(
+ "--output-dir",
+ default=pathlib.Path.cwd().resolve() / "output",
+ type=pathlib.Path,
+ help="output directory for markdown documentation",
+ )
+ parser.add_argument(
+ "-v",
+ "--verbose",
+ action="store_true",
+ help="Force maximum verbosity (DEBUG level + detailed output)",
+ )
+
+ parser.add_argument(
+ "-l",
+ "--log-level",
+ type=str.upper,
+ choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
+ default="INFO",
+ help="Set logging verbosity level",
+ )
+
+ parser.add_argument(
+ "--overrides",
+ type=pathlib.Path,
+ default=pathlib.Path.cwd().resolve() / "custom_documentation" / "src" / "documentation_overrides.yaml",
+ help="path to the overrides yaml file",
+ )
+
+ parser.add_argument(
+ "--csv",
+ type=pathlib.Path,
+ default=None,
+ help="Path to CSV file for missing documentation fields (optional)",
+ )
+
+ args = parser.parse_args()
+
+ configure_logging(args.log_level, args.verbose)
+
+ if args.no_cache and args.database.exists():
+ logging.info(f"Removing existing database {args.database} since --no-cache was specified")
+ args.database.unlink()
+
+ generate_custom_documentation_markdown(args.database, args.output_dir, args.csv)
+ logging.info(f"Generated markdown documentation to {args.output_dir}")
+
+if __name__ == "__main__":
+ try:
+ main()
+ except Exception:
+ traceback.print_exc()
+ sys.exit(1)
diff --git a/scripts/generate-docs/pydocgen/database.py b/scripts/generate-docs/pydocgen/database.py
new file mode 100644
index 000000000..84c084cd5
--- /dev/null
+++ b/scripts/generate-docs/pydocgen/database.py
@@ -0,0 +1,326 @@
+import logging
+import pathlib
+
+from typing import Optional
+
+from sqlmodel import SQLModel, Field, create_engine, Session, select, Relationship
+from sqlalchemy import Engine, Column, JSON
+
+from .models.custom_documentation import DocumentationOverrideMap, OsNameList
+from .models.packages import Package, PackageList
+
+
+#
+# These models represent the database tables for mapped fields
+#
+class PackageReference(SQLModel, table=True):
+ id: int = Field(default=None, primary_key=True)
+ package_data: str = Field(default="{}", sa_column=Column(JSON))
+
+
+class PackageField(SQLModel, table=True):
+ """
+ PackageField represents a specific field as defined in package/endpoint/data_stream/{type}/fields/fields.yml.
+ Each field in fields.yml has a name and description; this class holds the name, description, and a
+ reference to the parent package. These fields are used to provide descriptions for the fields in the
+ custom documentation.
+
+ Note: this is the database table definition for the Package class defined in models/packages.py.
+
+ Raises:
+ ValueError: if the package property is accessed without package_reference set
+ """
+ id: Optional[int] = Field(default=None, primary_key=True)
+ name: str
+ description: Optional[str] = None
+ example: Optional[str] = None
+ type: Optional[str] = None
+ package_reference_id: Optional[int] = Field(foreign_key="packagereference.id")
+ package_reference: Optional[PackageReference] = Relationship()
+
+ @property
+ def package(self) -> Package:
+ if not self.package_reference:
+ raise ValueError(f"PackageReference is not set for PackageField {self}")
+ return Package.model_validate_json(self.package_reference.package_data)
+
+
+#
+# These models represent the database tables for overrides
+#
+class OverrideField(SQLModel, table=True):
+ id: int = Field(default=None, primary_key=True)
+ description: Optional[str] = None
+ example: Optional[str] = None
+ type: Optional[str] = None
+
+
+class OverrideRelationship(SQLModel, table=True):
+ id: int = Field(default=None, primary_key=True)
+ name: str
+ event: Optional[str] = None
+ os: Optional[str] = None
+ default: bool = False
+ override_id: int = Field(foreign_key="overridefield.id")
+ override: OverrideField = Relationship(sa_relationship_kwargs={"lazy": "joined"})
+
+
+def populate_overrides(session: Session):
+ dom = DocumentationOverrideMap.from_yaml()
+ for name, mapping in dom.items():
+ if mapping.os:
+ for os, override in mapping.os.items():
+ record = OverrideField(
+ description=override.description,
+ example=override.example,
+ type=override.type,
+ )
+ session.add(record)
+ session.flush()
+
+ related_record = OverrideRelationship(
+ name=name, os=os, override_id=record.id
+ )
+ session.add(related_record)
+
+ if mapping.event:
+ for event, override in mapping.event.items():
+
+ record = OverrideField(
+ description=override.description,
+ example=override.example,
+ type=override.type,
+ )
+ session.add(record)
+ session.flush()
+
+ related_record = OverrideRelationship(
+ name=name, event=event, override_id=record.id
+ )
+ session.add(related_record)
+
+ if mapping.default:
+ record = OverrideField(
+ description=mapping.default.description,
+ example=mapping.default.example,
+ type=mapping.default.type,
+ )
+ session.add(record)
+ session.flush()
+
+ related_record = OverrideRelationship(
+ name=name, default=True, override_id=record.id
+ )
+ session.add(related_record)
+
+ session.commit()
+
+
+def populate_packages_fields(session: Session):
+ """
+ populate_packages_fields populates the package fields in the database
+
+ Args:
+ session: database session
+ """
+
+ def add_to_db(field: PackageField, session: Session):
+ existing_field = session.exec(
+ select(PackageField).where(PackageField.name == field.name)
+ ).first()
+ if existing_field:
+ if existing_field.description != field.description:
+ raise ValueError(
+ f"Field {field.name} already exists with different description"
+ )
+ else:
+ logging.debug(f" Adding field {field.name}")
+ session.add(field)
+
+ package_list = PackageList.from_files()
+ for package in package_list:
+ logging.debug(f"Adding package fields for {package.filepath}")
+ package_ref = PackageReference(package_data=package.model_dump_json())
+ session.add(package_ref)
+ session.flush()
+ for field in package.fields:
+ if field.fields:
+ for sub_field in field.fields:
+ name = f"{field.name}.{sub_field.name}"
+ add_to_db(
+ PackageField(
+ name=name,
+ description=sub_field.description,
+ package_reference_id=package_ref.id,
+ example=sub_field.example,
+ type=sub_field.type,
+ ),
+ session,
+ )
+ else:
+ add_to_db(
+ PackageField(
+ name=field.name,
+ description=field.description,
+ package_reference_id=package_ref.id,
+ example=field.example,
+ type=field.type,
+ ),
+ session,
+ )
+ session.commit()
+
+
+class OverrideQueryResult:
+ """
+ Represents the result of querying for field overrides, prioritized by event, OS, and default.
+
+ This class retrieves and stores a prioritized list of field overrides for a given field name,
+ event name, and OS name from the database. The priority order is: event-specific override (highest),
+ then OS-specific override, and finally the default override (lowest).
+
+ Properties such as `description`, `example`, and `type` return the value from the highest-priority
+ override that provides a non-empty value, or None if none are found.
+
+ Args:
+ session: SQLModel session used to query the database.
+ field_name: Name of the field to retrieve overrides for.
+ event_name: Name of the event to prioritize event-specific overrides.
+ os_names: List of OS names used to select OS-specific overrides.
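+
+ Example (illustrative; assumes a populated session, and an event name that exists in the data):
+ result = OverrideQueryResult(session, "event.category", "file_open", ["linux"])
+ result.description # highest-priority non-empty value: event, then OS, then default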
+ """
+
+ def __init__(
+ self, session: Session, field_name: str, event_name: str, os_names: OsNameList
+ ):
+ """
+ Initialize OverrideQueryResult.
+
+ Args:
+ session: SQLModel session.
+ field_name: Name of the field.
+ event_name: Name of the event.
+ os_names: List of OS names.
+ """
+ self.overrides: list[OverrideField | None] = []
+
+ overrides = session.exec(
+ select(OverrideRelationship).where(OverrideRelationship.name == field_name)
+ ).all()
+
+ #
+ # These functions resolve the overrides for event, os, and default respectively.
+ #
+ def event_override() -> OverrideField | None:
+ """
+ Returns the event override if it exists, otherwise None.
+ """
+ return next((o.override for o in overrides if o.event == event_name), None)
+
+ def os_override() -> OverrideField | None:
+ """
+ Returns the OS override if one exists. There can be multiple OS overrides, so the
+ relevant ones for this document are rendered as markdown tables.
+ """
+ description = None
+ example = None
+ type = None
+ for o in overrides:
+ if o.os and o.os in os_names:
+ if o.override.description:
+ if not description:
+ description = "|OS|Description|\n|---|---|\n"
+ description += f"|{o.os}|{o.override.description}|\n"
+ if o.override.example:
+ if not example:
+ example = "|OS|Example|\n|---|---|\n"
+ example += f"|{o.os}|{o.override.example}|\n"
+ if o.override.type:
+ if not type:
+ type = "|OS|Type|\n|---|---|\n"
+ type += f"|{o.os}|{o.override.type}|\n"
+
+ return (
+ OverrideField(
+ description=description,
+ example=example,
+ type=type,
+ )
+ if any([description, example, type])
+ else None
+ )
+
+ def default_override() -> OverrideField | None:
+ """
+ Returns the default override if it exists, otherwise None.
+ """
+ return next((o.override for o in overrides if o.default), None)
+
+ # We save the overrides in order of priority, so that we can return the highest-priority override
+ self.overrides = [event_override(), os_override(), default_override()]
+
+ @property
+ def description(self) -> str | None:
+ """
+ Returns the description from the highest-priority override that provides a non-empty value, or None.
+ """
+ for override in self.overrides:
+ if override and override.description:
+ return override.description
+ return None
+
+ @property
+ def example(self) -> str | None:
+ """
+ Returns the example from the highest-priority override that provides a non-empty value, or None.
+ """
+ for override in self.overrides:
+ if override and override.example:
+ return override.example
+ return None
+
+ @property
+ def type(self) -> str | None:
+ """
+ Returns the type from the highest-priority override that provides a non-empty value, or None.
+ """
+ for override in self.overrides:
+ if override and override.type:
+ return override.type
+ return None
+
+
+def getDatabase(db_path: pathlib.Path) -> Engine:
+ """
+ getDatabase creates a database if it does not exist, otherwise it uses the existing database
+
+ This stores the documentation in package/endpoint/data_stream in a lightweight SQLite database. We will
+ use this when generating markdown documentation for the fields defined in the custom_documentation.
+
+ Overrides are also added to the database here.
+
+ Args:
+ db_path: path to the database
+
+ Returns:
+ database Engine
+ """
+ if db_path.exists():
+ logging.info(f"Using existing database at {db_path}")
+ return create_engine(f"sqlite:///{db_path}")
+
+ logging.info(f"Creating database at {db_path}")
+ engine = create_engine(f"sqlite:///{db_path}")
+ SQLModel.metadata.create_all(engine)
+ with Session(engine) as session:
+ populate_packages_fields(session)
+ populate_overrides(session)
+ session.commit()
+ return engine
diff --git a/scripts/generate-docs/pydocgen/markdown.py b/scripts/generate-docs/pydocgen/markdown.py
new file mode 100644
index 000000000..785a8caba
--- /dev/null
+++ b/scripts/generate-docs/pydocgen/markdown.py
@@ -0,0 +1,421 @@
+import csv
+import hashlib
+import logging
+import os
+import pathlib
+from typing import List, TextIO
+
+from sqlmodel import Session, select
+
+from .models.custom_documentation import OsNameList
+from .database import OverrideQueryResult, PackageField, getDatabase
+from .models import CustomDocumentationList, Filter
+
+
+def quote_markdown_string(s: str) -> str:
+ """
+ quote_markdown_string prepends each line of a string with a '>' character
+ Args:
+ s: string to quote
+
+ Returns:
+ quoted string
+ """
+ return "\n".join(f"> {line}" for line in s.splitlines())
+
+
+def generate_random_sha256() -> str:
+ """
+ generate_random_sha256 generates a random SHA256 hash for use in example fields
+
+ Returns:
+ random SHA256 hash
+ """
+ return hashlib.sha256(os.urandom(32)).hexdigest()
+
+
+class FieldMetadata:
+ """
+ FieldMetadata contains all the information necessary to generate markdown for a field.
+ It queries the package field database for ECS metadata and the overrides database
+ for endpoint-specific metadata. It also generates a random SHA256 hash as the example
+ if the field is a SHA256 hash and no example is provided.
+ """
+
+ def __init__(
+ self, field: str, session: Session, event_name: str, os_names: OsNameList
+ ) -> None:
+ """
+ __init__ queries the database for ECS metadata and endpoint-specific metadata. Also
+ generates a random SHA256 hash if the field is a SHA256 hash and no example is provided.
+
+ Args:
+ field: field name
+ session: active sqlmodel session for querying the database
+ event_name: name of the event
+ os_names: list of os names (e.g., "windows", "linux", "macos")
+ """
+ self.field = field
+ self.event_name = event_name
+ self.os_names = os_names
+
+ self.endpoint_description: str | None = None
+ self.endpoint_example: str | None = None
+ self.endpoint_type: str | None = None
+ self.ecs_description: str | None = None
+ self.ecs_example: str | None = None
+ self.ecs_type: str | None = None
+
+ self._populate_ecs_metadata(session)
+ self._populate_endpoint_metadata(session)
+
+ if not self.ecs_example and self.field.endswith(".sha256"):
+ # If the field is a SHA256 hash, generate a random example if none is provided
+ self.ecs_example = generate_random_sha256()
+
+ def _populate_ecs_metadata(self, session: Session) -> None:
+ """
+ _populate_ecs_metadata populates the ECS metadata for a field
+ based on the package field database
+
+ Args:
+ session: SQLAlchemy session
+ """
+ package_field: PackageField | None = session.exec(
+ select(PackageField).where(PackageField.name == self.field)
+ ).first()
+ if package_field:
+ # Copy the ECS metadata from the package field verbatim
+ self.ecs_description = package_field.description
+ self.ecs_example = package_field.example
+ self.ecs_type = package_field.type
+
+ def _populate_endpoint_metadata(self, session: Session) -> None:
+ """
+ _populate_endpoint_metadata populates the endpoint metadata for a field
+ based on the overrides database
+
+ Args:
+ session: SQLAlchemy session
+ """
+ result = OverrideQueryResult(
+ session, self.field, self.event_name, self.os_names
+ )
+ if result.description:
+ self.endpoint_description = result.description
+ if result.example:
+ self.endpoint_example = result.example
+ if result.type:
+ self.endpoint_type = result.type
+
+ def has_data(self) -> bool:
+ """
+ has_data checks if the metadata has any data populated
+
+ Returns:
+ True if any metadata is populated, False otherwise
+ """
+ return any(
+ [
+ self.ecs_description,
+ self.ecs_example,
+ self.ecs_type,
+ self.endpoint_description,
+ self.endpoint_example,
+ self.endpoint_type,
+ ]
+ )
+
+ def missing_data(self) -> bool:
+ """
+ missing_data checks if the metadata is missing any data
+
+ Returns:
+ True if any metadata is missing, False otherwise
+ """
+ return not all(
+ [
+ self.ecs_description,
+ self.ecs_example,
+ self.ecs_type,
+ self.endpoint_description,
+ self.endpoint_example,
+ self.endpoint_type,
+ ]
+ )
+
+ def write_markdown(self, f: TextIO) -> None:
+ """
+ write_markdown writes the field metadata to a markdown file
+ Args:
+ f: file object to write to
+ """
+ f.write(f"### `{self.field}`\n\n")
+ if not self.has_data():
+ f.write("No description or example found\n\n")
+ f.write("
\n\n")
+ return
+
+ if self.ecs_description:
+ f.write("**ECS Description**\n\n")
+ f.write(f"{quote_markdown_string(self.ecs_description)}\n\n")
+ if self.endpoint_description:
+ f.write("**Extended Description**\n\n")
+ f.write(f"{quote_markdown_string(self.endpoint_description)}\n\n")
+ if self.endpoint_example:
+ f.write("**Example**\n\n")
+ f.write(f"{quote_markdown_string(self.endpoint_example)}\n\n")
+ elif self.ecs_example:
+ f.write("**Example**\n\n")
+ f.write(f"{quote_markdown_string(self.ecs_example)}\n\n")
+ if self.endpoint_type:
+ f.write("**Type**\n\n")
+ f.write(f"{quote_markdown_string(self.endpoint_type)}\n\n")
+ elif self.ecs_type:
+ f.write("**Type**\n\n")
+ f.write(f"{quote_markdown_string(self.ecs_type)}\n\n")
+ f.write("
\n\n")
+
+
+class MetadataCsvWriter:
+ """
+ This class will write a CSV file that contains fields
+ that are missing either a description or an example. This
+ can be imported into a spreadsheet to track missing documentation
+ """
+
+ FIELD_NAME = "Field Name"
+ FIELD_EVENT_NAME = "Event Name"
+ FIELD_HAS_ECS_DESCRIPTION = "Has ECS Description"
+ FIELD_HAS_ECS_EXAMPLE = "Has ECS Example"
+ FIELD_HAS_ECS_TYPE = "Has ECS Type"
+ FIELD_HAS_ENDPOINT_DESCRIPTION = "Has Endpoint Description"
+ FIELD_HAS_ENDPOINT_EXAMPLE = "Has Endpoint Example"
+
+ def __init__(self, csv_path: pathlib.Path):
+
+ self.csv_path = csv_path
+ self.fields = [
+ self.FIELD_NAME,
+ self.FIELD_EVENT_NAME,
+ self.FIELD_HAS_ECS_DESCRIPTION,
+ self.FIELD_HAS_ECS_EXAMPLE,
+ self.FIELD_HAS_ECS_TYPE,
+ self.FIELD_HAS_ENDPOINT_DESCRIPTION,
+ self.FIELD_HAS_ENDPOINT_EXAMPLE,
+ ]
+ self.rows = []
+
+ def add_row(self, field: FieldMetadata):
+ """
+ add_row adds a row to the CSV output
+
+ Args:
+ field: FieldMetadata object containing the field information
+ """
+ self.rows.append(
+ {
+ self.FIELD_NAME: field.field,
+ self.FIELD_EVENT_NAME: field.event_name,
+ self.FIELD_HAS_ECS_DESCRIPTION: bool(field.ecs_description),
+ self.FIELD_HAS_ECS_EXAMPLE: bool(field.ecs_example),
+ self.FIELD_HAS_ECS_TYPE: bool(field.ecs_type),
+ self.FIELD_HAS_ENDPOINT_DESCRIPTION: bool(field.endpoint_description),
+ self.FIELD_HAS_ENDPOINT_EXAMPLE: bool(field.endpoint_example),
+ }
+ )
+
+ def write_csv(self):
+ """
+ write_csv writes the collected rows to a CSV file
+ """
+ logging.debug(f"Generating CSV output at {self.csv_path}")
+ with self.csv_path.open("w", newline="") as csvfile:
+ writer = csv.DictWriter(csvfile, fieldnames=self.fields)
+ writer.writeheader()
+ for row in self.rows:
+ writer.writerow(row)
+
+
+def generate_custom_documentation_markdown(
+ db_path: pathlib.Path,
+ output_dir: pathlib.Path,
+ csv_path: pathlib.Path | None = None,
+):
+ """
+ Generate markdown files for custom documentation
+ """
+
+ def get_output_filepath(src_path: pathlib.Path) -> pathlib.Path:
+ """
+ get_output_filepath determines the output filename for writing markdown, based
+ on the source path of the package
+
+ Args:
+ src_path: yaml file path
+
+ Returns:
+ output filepath
+ """
+ parts = src_path.parts
+ index = parts.index("data_stream")
+ output_filename = output_dir
+ for part in parts[index + 1 : -1]:
+ output_filename = output_filename / part
+ return output_filename / parts[-1].replace(".yaml", ".md")
+
+ def get_formatted_os_name(os: str) -> str:
+ """
+ get_formatted_os_name os names in the definitions are always lowercase; this function
+ returns the correctly cased display name for the os
+
+ Args:
+ os: os name
+
+ Returns:
+ correctly cased os name
+ """
+ match os:
+ case "windows":
+ return "Windows"
+ case "linux":
+ return "Linux"
+ case "macos":
+ return "macOS"
+ case _:
+ raise ValueError(
+ f"Unknown OS name: {os}. Please add it to the get_formatted_os_name function."
+ )
+
+ def get_formatted_os_string(os_list: OsNameList) -> str:
+ """
+ get_formatted_os_string some documents apply to multiple OSes; this function formats
+ them as a comma-separated string for the markdown output
+
+ Args:
+ os_list: list of os names
+
+ Returns:
+ formatted os string
+ """
+ return ", ".join(get_formatted_os_name(os) for os in os_list)
+
+ def get_kql_query_string(filter: Filter) -> str:
+ """
+ get_kql_query_string generates a KQL query string from a Filter object
+
+ Args:
+ filter: Filter object from the custom documentation
+
+ Returns:
+ KQL query string
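+
+ Example (illustrative): a filter with event.category ["file"] and
+ event.action ["creation", "open"] yields:
+ event.category : "file" and event.action : "creation" or "open"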
+ """
+ queries = []
+ filter_data = filter.dict()
+ for field, metadata in Filter.model_fields.items():
+ field_name = metadata.alias if metadata.alias else field
+ field_value = filter_data.get(field)
+ if not field_value:
+ continue
+ if isinstance(field_value, list):
+ if len(field_value) == 1:
+ field_value = f"{field_value[0]}"
+ else:
+ field_value = '" or "'.join(field_value)
+ queries.append(f'{field_name} : "{field_value}"')
+ return " and ".join(queries)
+
+ #
+ # Function Begin
+ #
+
+ # Create or get the populated database
+ engine = getDatabase(db_path)
+
+ # Get the custom documentation
+ custom_docs = CustomDocumentationList.from_files()
+
+ csv_writer: MetadataCsvWriter | None = None
+ if csv_path:
+ csv_writer = MetadataCsvWriter(csv_path)
+
+ # Generate markdown for each custom document
+ with Session(engine) as session:
+ for custom_doc in custom_docs:
+
+ # Get the output filename and create the parent directories
+ output_filename = get_output_filepath(custom_doc.filepath)
+ output_filename.parent.mkdir(parents=True, exist_ok=True)
+
+ # Write the markdown file
+ with output_filename.open("w") as f:
+ f.write(f"# {custom_doc.overview.name}\n\n")
+
+ f.write(f"## Description\n\n")
+ f.write(f"{custom_doc.overview.description}\n\n")
+
+ f.write("## Overview\n\n")
+ f.write("
OS | \n") + f.write( + f"{get_formatted_os_string(custom_doc.identification.os)} | \n" + ) + f.write("
Data Stream | \n") + f.write(f"{custom_doc.identification.data_stream} | \n") + f.write("
KQL Query | \n") + f.write( + f"{get_kql_query_string(custom_doc.identification.filter)} | \n"
+ )
+ f.write("