diff --git a/plugin/skills/azure-data-collection-rules/SKILL.md b/plugin/skills/azure-data-collection-rules/SKILL.md new file mode 100644 index 000000000..efb3df1fc --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/SKILL.md @@ -0,0 +1,44 @@ +--- +name: azure-data-collection-rules +description: "Author, edit, validate, and deploy Azure Monitor Data Collection Rules (DCRs), Log Analytics workspace tables, and KQL ingestion-time transformations. Covers single-stage and multi-stage transformation DCRs, client-side and ingestion-side processors, stream declarations, and custom table creation. Also covers direct ingestion DCRs for the Log Ingestion API. WHEN: create DCR, edit DCR, data collection rule, DCR JSON, add transformation, KQL transform, custom table, stream declaration, multi-stage transformation, processor, client-side transform, ingestion-time transform, parse JSON logs, filter syslog, aggregate events, custom log table, DCR schema, DCR authoring, rename columns, drop columns, CEF parsing, XML parsing, data collection, Log Ingestion API, direct ingestion, send custom logs, custom log ingestion, logs ingestion endpoint, DCR endpoint." +argument-hint: "Describe the data source type, desired transformations, and destination table" +license: MIT +metadata: + author: Microsoft + version: "1.0.0" +--- + +# DCR Authoring Skill + +Author, validate, and deploy Azure Monitor Data Collection Rules with single-stage, multi-stage, or direct ingestion configurations. + +## Procedure + +Follow the [full procedure](./references/procedure.md): + +1. **Gather requirements** — ingestion method, data source, intent, destination, split/copy needs +2. **Determine DCR kind** — per [DCR kinds guide](./references/dcr-kinds.md) +3. **Design transformation pipeline** — native filters, processors, or `transformKql` +4. **Author the DCR** — per [DCR schema](./references/dcr-schema.md) and [routing rules](./references/destination-routing.md) +5. 
**Validate** — run [validate-dcr.ps1](./scripts/validate-dcr.ps1) +6. **Deploy** — create any custom destination tables first via [create-custom-table.ps1](./scripts/create-custom-table.ps1), then run [put-dcr.ps1](./scripts/put-dcr.ps1) +7. **Verify** — query destination table, check `_LogOperation` for errors + +## References + +- [Procedure](./references/procedure.md) — full step-by-step workflow +- [DCR kinds](./references/dcr-kinds.md) — kind selection, data source types, transformation sections +- [DCR schema](./references/dcr-schema.md) — top-level structure, column constraints, dataFlows, transformations, REST API +- [Stream declarations](./references/stream-declarations.md) — custom stream schemas (Direct + logFiles only) +- [Processors: headers](./references/processors-headers.md) — header processor types, stage availability, output columns +- [Processors: operations](./references/processors-operations.md) — filter, map, parse, aggregate, enrich, KQL syntax +- [Processor heuristics: filters](./references/processor-heuristics-filters.md) — native filter check, filtering intent map +- [Processor heuristics: transforms](./references/processor-heuristics-transforms.md) — parsing, schema, aggregation, enrichment, routing intent maps +- [Processor heuristics: staging](./references/processor-heuristics-staging.md) — stage placement, cost optimization, multi-processor chains +- [Destination routing](./references/destination-routing.md) — stream-to-table mapping +- [Supported tables](./references/supported-tables.md) — standard tables accepting custom streams +- [KQL transforms](./references/kql-transforms.md) — common KQL patterns +- [LA tables](./references/la-tables.md) — table creation, plans +- [Direct ingestion](./references/direct-ingestion.md) — Log Ingestion API DCRs +- [Decision guide](./references/decision-guide.md) — scenario routing table +- [Limits](./references/limits.md) — DCR structure limits, column constraints, API quotas diff --git 
a/plugin/skills/azure-data-collection-rules/examples/custom-json-log.json b/plugin/skills/azure-data-collection-rules/examples/custom-json-log.json new file mode 100644 index 000000000..47a7feed2 --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/examples/custom-json-log.json @@ -0,0 +1,75 @@ +{ + "location": "eastus", + "properties": { + "streamDeclarations": { + "Custom-JsonAppLogs": { + "columns": [ + { "name": "TimeGenerated", "type": "datetime" }, + { "name": "Computer", "type": "string" }, + { "name": "Level", "type": "string" }, + { "name": "UserId", "type": "string" }, + { "name": "RequestId", "type": "string" }, + { "name": "Message", "type": "string" }, + { "name": "Duration", "type": "real" } + ] + } + }, + "dataSources": { + "logFiles": [ + { + "name": "appJsonLogs", + "filePatterns": ["/var/log/myapp/*.json"], + "format": "json", + "streams": ["Custom-JsonAppLogs"], + "transform": "parse_app_json" + } + ] + }, + "destinations": { + "logAnalytics": [ + { + "name": "myWorkspace", + "workspaceResourceId": "/subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.OperationalInsights/workspaces/{ws}", + "workspaceId": "{workspace-guid}" + } + ] + }, + "dataFlows": [ + { + "streams": ["Custom-JsonAppLogs"], + "destinations": ["myWorkspace"], + "outputStream": "Custom-AppLogs_CL", + "transformKql": "source | where Level != 'Debug' | project-away Duration" + } + ], + "transformations": [ + { + "name": "parse_app_json", + "headerProcessor": { + "processor": "header.TextLog" + }, + "processors": [ + { + "processor": "parse.JsonPath", + "configuration": { + "columnName": "RawData", + "all": [ + { "path": "$.level", "nameAs": "Level", "typeAs": "string" }, + { "path": "$.userId", "nameAs": "UserId", "typeAs": "string" }, + { "path": "$.requestId", "nameAs": "RequestId", "typeAs": "string" }, + { "path": "$.message", "nameAs": "Message", "typeAs": "string" }, + { "path": "$.durationMs", "nameAs": "Duration", "typeAs": "real" } + ] + } + }, + { + 
"processor": "map.Drop", + "configuration": { + "columnNames": ["RawData", "FilePath"] + } + } + ] + } + ] + } +} diff --git a/plugin/skills/azure-data-collection-rules/examples/direct-ingestion-custom-table.json b/plugin/skills/azure-data-collection-rules/examples/direct-ingestion-custom-table.json new file mode 100644 index 000000000..6daf1689c --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/examples/direct-ingestion-custom-table.json @@ -0,0 +1,35 @@ +{ + "location": "eastus", + "kind": "Direct", + "properties": { + "streamDeclarations": { + "Custom-MyAppLogs": { + "columns": [ + { "name": "Time", "type": "datetime" }, + { "name": "Computer", "type": "string" }, + { "name": "Application", "type": "string" }, + { "name": "Level", "type": "string" }, + { "name": "Message", "type": "string" }, + { "name": "RequestId", "type": "string" }, + { "name": "DurationMs", "type": "real" } + ] + } + }, + "destinations": { + "logAnalytics": [ + { + "workspaceResourceId": "/subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.OperationalInsights/workspaces/{ws}", + "name": "myworkspace" + } + ] + }, + "dataFlows": [ + { + "streams": ["Custom-MyAppLogs"], + "destinations": ["myworkspace"], + "transformKql": "source | project TimeGenerated = Time, Computer, Application, Level, Message, RequestId, DurationMs | where Level != 'Debug'", + "outputStream": "Custom-AppLogs_CL" + } + ] + } +} diff --git a/plugin/skills/azure-data-collection-rules/examples/perf-counter-aggregation.json b/plugin/skills/azure-data-collection-rules/examples/perf-counter-aggregation.json new file mode 100644 index 000000000..77c0b1da3 --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/examples/perf-counter-aggregation.json @@ -0,0 +1,72 @@ +{ + "location": "eastus", + "properties": { + "streamDeclarations": { + "Custom-PerfSummary": { + "columns": [ + { "name": "TimeGenerated", "type": "datetime" }, + { "name": "Host", "type": "string" }, + { "name": "CounterName", "type": 
"string" }, + { "name": "AvgValue", "type": "real" }, + { "name": "MaxValue", "type": "real" }, + { "name": "RecordCount", "type": "int" } + ] + } + }, + "dataSources": { + "performanceCounters": [ + { + "name": "perfCpu", + "samplingFrequencyInSeconds": 10, + "counterSpecifiers": [ + "\\Processor(_Total)\\% Processor Time", + "\\Memory\\Available MBytes" + ], + "streams": ["Custom-PerfSummary"], + "transform": "aggregate_perf" + } + ] + }, + "destinations": { + "logAnalytics": [ + { + "name": "myWorkspace", + "workspaceResourceId": "/subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.OperationalInsights/workspaces/{ws}", + "workspaceId": "{workspace-guid}" + } + ] + }, + "dataFlows": [ + { + "streams": ["Custom-PerfSummary"], + "destinations": ["myWorkspace"], + "outputStream": "Custom-PerfSummary_CL" + } + ], + "transformations": [ + { + "name": "aggregate_perf", + "headerProcessor": { + "processor": "header.WindowsPerformanceCounters" + }, + "processors": [ + { + "processor": "aggregate.Basic", + "configuration": { + "batchingSettings": { + "timeWindow": "5m", + "maxBatchRows": 1000 + }, + "aggregates": [ + { "columnName": "CounterValue", "operator": "avg", "nameAs": "AvgValue" }, + { "columnName": "CounterValue", "operator": "max", "nameAs": "MaxValue" }, + { "operator": "count", "nameAs": "RecordCount" } + ], + "dimensionColumns": ["Host", "CounterName"] + } + } + ] + } + ] + } +} diff --git a/plugin/skills/azure-data-collection-rules/examples/syslog-filter-drop.json b/plugin/skills/azure-data-collection-rules/examples/syslog-filter-drop.json new file mode 100644 index 000000000..f87b26896 --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/examples/syslog-filter-drop.json @@ -0,0 +1,54 @@ +{ + "location": "eastus", + "properties": { + "dataSources": { + "syslog": [ + { + "name": "syslogAuth", + "facilityNames": ["auth", "authpriv"], + "logLevels": ["Warning", "Error", "Critical", "Alert", "Emergency"], + "streams": ["Microsoft-Syslog"], + 
"transform": "drop_auth_columns" + }, + { + "name": "syslogOther", + "facilityNames": ["daemon", "kern"], + "logLevels": ["Error", "Critical", "Alert", "Emergency"], + "streams": ["Microsoft-Syslog"] + } + ] + }, + "destinations": { + "logAnalytics": [ + { + "name": "myWorkspace", + "workspaceResourceId": "/subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.OperationalInsights/workspaces/{ws}", + "workspaceId": "{workspace-guid}" + } + ] + }, + "dataFlows": [ + { + "streams": ["Microsoft-Syslog"], + "destinations": ["myWorkspace"], + "outputStream": "Microsoft-Syslog" + } + ], + "transformations": [ + { + "name": "drop_auth_columns", + "headerProcessor": { + "processor": "header.Syslog" + }, + "processors": [ + { + "processor": "map.Drop", + "configuration": { + "columnNames": ["ProcessId", "HostIP"] + } + } + ] + } + ] + } +} diff --git a/plugin/skills/azure-data-collection-rules/examples/windows-events-split.json b/plugin/skills/azure-data-collection-rules/examples/windows-events-split.json new file mode 100644 index 000000000..159fc270e --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/examples/windows-events-split.json @@ -0,0 +1,83 @@ +{ + "location": "eastus", + "properties": { + "streamDeclarations": { + "Custom-SecurityEvents": { + "columns": [ + { "name": "TimeGenerated", "type": "datetime" }, + { "name": "Computer", "type": "string" }, + { "name": "EventID", "type": "int" }, + { "name": "Channel", "type": "string" }, + { "name": "UserName", "type": "string" }, + { "name": "EventDescription", "type": "string" } + ] + } + }, + "dataSources": { + "windowsEventLogs": [ + { + "name": "securityEvents", + "xPathQueries": [ + "Security!*[System[(EventID=4624 or EventID=4625 or EventID=4648)]]" + ], + "streams": ["Custom-SecurityEvents"], + "transform": "parse_security_events" + }, + { + "name": "systemEvents", + "xPathQueries": [ + "System!*[System[(Level=1 or Level=2 or Level=3)]]" + ], + "streams": ["Microsoft-Event"] + } + ] + }, + 
"destinations": { + "logAnalytics": [ + { + "name": "myWorkspace", + "workspaceResourceId": "/subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.OperationalInsights/workspaces/{ws}", + "workspaceId": "{workspace-guid}" + } + ] + }, + "dataFlows": [ + { + "streams": ["Custom-SecurityEvents"], + "destinations": ["myWorkspace"], + "outputStream": "Custom-SecurityEvents_CL" + }, + { + "streams": ["Microsoft-Event"], + "destinations": ["myWorkspace"], + "outputStream": "Microsoft-Event" + } + ], + "transformations": [ + { + "name": "parse_security_events", + "headerProcessor": { + "processor": "header.WindowsEvents" + }, + "processors": [ + { + "processor": "parse.XmlPath", + "configuration": { + "columnName": "RawXml", + "all": [ + { "path": "/Event/System/EventID", "nameAs": "EventID", "typeAs": "int" }, + { "path": "/Event/EventData/Data[@Name='TargetUserName']", "nameAs": "UserName", "typeAs": "string" } + ] + } + }, + { + "processor": "map.Drop", + "configuration": { + "columnNames": ["RawXml", "RenderingInfo", "EventRecordId", "PublisherId"] + } + } + ] + } + ] + } +} diff --git a/plugin/skills/azure-data-collection-rules/references/dcr-kinds.md b/plugin/skills/azure-data-collection-rules/references/dcr-kinds.md new file mode 100644 index 000000000..c73de240a --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/references/dcr-kinds.md @@ -0,0 +1,76 @@ +# DCR Kinds Guide + +Choose the DCR kind based on the data collection scenario, then use the available transformation sections for that kind. + +## Kind Selection + +| Scenario | `kind` value | Notes | +|---|---|---| +| Data from Log Ingestion API (apps, scripts, log forwarders like Logstash/Fluentbit) | `Direct` | Endpoints auto-created. See [direct ingestion](./direct-ingestion.md). 
| +| AMA on Linux VMs/VMSS/containers | `Linux` | Syslog, perf counters, text/JSON log files | +| AMA on Windows VMs/VMSS/containers | `Windows` | Windows event logs, perf counters, IIS logs | +| Ingestion-time transform on diagnostic settings data | `WorkspaceTransforms` | No input stream. One per workspace. Must link DCR ↔ workspace. | + +Avoid cross-OS DCRs unless required. Use OS-specific `kind` for AMA DCRs. + +## Agent-Based DCRs (`kind: "Linux"` or `kind: "Windows"`) + +### Data Source Types + +The `dataSources` section defines what to collect. Each data source specifies collection settings, output streams, and optional client-side transform. + +| Property | Type | Required | Description | +|----------|------|----------|-------------| +| `name` | string | Yes | Unique identifier within the DCR | +| `streams` | string[] | Yes | Output streams. `Microsoft-*` for standard, `Custom-*` for custom | +| `transform` | string | No | Reference to a named transformation (multi-stage only) | + +| Type | Standard Streams | Key Parameters | +|------|-----------------|----------------| +| `syslog` | `Microsoft-Syslog`, `Microsoft-CommonSecurityLog` | `facilityNames`, `logLevels` | +| `windowsEventLogs` | `Microsoft-Event` | `xPathQueries` | +| `performanceCounters` | `Microsoft-Perf`, `Microsoft-InsightsMetrics` | `samplingFrequencyInSeconds`, `counterSpecifiers` | +| `logFiles` | Custom only | `filePatterns`, `format` ("json" or "text") | +| `iisLogs` | `Microsoft-W3CIISLog` | `logDirectories` | + +Multiple data sources of the same type are allowed, each with its own transform and streams. + +### Available Transformation Sections (design in this order) + +| # | Section | Runs at | Purpose | +|---|---|---|---| +| 1 | `dataSources` native parameters | Agent (pre-collection) | Filter at source: `facilityNames`, `logLevels`, `xPathQueries`, `counterSpecifiers`, `filePatterns`. No cost, no API version requirement. 
| +| 2 | `dataSources[].transform` reference | Agent (client-side) | Processors: filter, parse, aggregate, map, enrich. Reduces network egress. Requires `transformations` section + API `2025-05-11`+. | +| 3 | `dataFlows[].transformKql` | Pipeline (ingestion-side) | Single-stage KQL transform. Cannot combine with `transform` on same data flow. | +| 4 | `dataFlows[].transform` reference | Pipeline (ingestion-side) | Processor chain on ingestion side. Requires `transformations` section + API `2025-05-11`+. | + +Consult the processor heuristics ([filters](./processor-heuristics-filters.md), [transforms](./processor-heuristics-transforms.md), [staging](./processor-heuristics-staging.md)) for processor selection, stage placement, and ordering. + +## Direct Ingestion DCR (`kind: "Direct"`) + +Used when data arrives via the Log Ingestion API, including upstream log forwarding tools (Logstash, Fluentbit, etc.). + +Available transformation sections: + +| Section | Purpose | +|---|---| +| `streamDeclarations` | Define incoming JSON schema (required, keys must start with `Custom-`) | +| `dataFlows[].transformKql` | KQL to map/filter/transform incoming data to destination schema | + +**Not available:** `dataSources`, native filters, client-side processors, `transformations` section. + +**Design order:** (1) define input stream schema in `streamDeclarations`, (2) write `transformKql` to map to destination. See [direct ingestion reference](./direct-ingestion.md). + +## Workspace Transform DCR (`kind: "WorkspaceTransforms"`) + +Adds ingestion-time transformations to data arriving via diagnostic settings or any non-DCR workflow. The transform applies automatically to all data sent to the specified table (unless sent via its own dedicated DCR). + +**Critical constraints the agent MUST enforce:** +- Only **one** workspace transform DCR can be linked to a given LA workspace +- Before creating: **check if a workspace transform DCR already exists** for the target workspace (query for DCRs with `kind: "WorkspaceTransforms"` in the same resource group/subscription). 
If one exists, add the new table transform as an additional `dataFlows` entry to the existing DCR. +- The DCR must reference the workspace as a destination, AND the workspace must be linked back to the DCR via `defaultDataCollectionRuleResourceId` +- No `dataSources` section. Streams use `Microsoft-Table-{TableName}` format. + +**Structure:** No `dataSources` section. Streams use `Microsoft-Table-{TableName}` format. Each table transform is a separate `dataFlows` entry. After deploying, link workspace to DCR via `defaultDataCollectionRuleResourceId` property on the workspace resource. + +See [processor-heuristics-filters.md](./processor-heuristics-filters.md) for native filter reference. diff --git a/plugin/skills/azure-data-collection-rules/references/dcr-schema.md b/plugin/skills/azure-data-collection-rules/references/dcr-schema.md new file mode 100644 index 000000000..9f67ce02c --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/references/dcr-schema.md @@ -0,0 +1,109 @@ +# DCR Schema Reference + +API version: `2025-05-11` (required for multi-stage transformations). +Earlier API versions support single-stage only (`transformKql` in dataFlows). + +## Top-Level Structure + +```jsonc +{ + "location": "{azureRegion}", + "kind": "{kind}", // See kind table below. Required for "Direct" and "WorkspaceTransforms"; recommended (not required) for agent DCRs. + "properties": { + "streamDeclarations": { }, // Custom stream schemas + "dataSources": { }, // What to collect (omit for kind: "Direct") + "destinations": { }, // Where to send + "dataFlows": [ ], // Ingestion-side routing + transforms + "transformations": [ ] // Named transform definitions (multi-stage) + } +} +``` + +## kind + +| Value | Use Case | +|-------|----------| +| `"Linux"` | AMA collecting from Linux VMs/VMSS/containers. Preferred over omitting kind. | +| `"Windows"` | AMA collecting from Windows VMs/VMSS/containers. Preferred over omitting kind. | +| `"Direct"` | Log Ingestion API (apps, scripts, upstream forwarders like Logstash/Fluentbit). 
Auto-generates `logsIngestion` endpoint. No `dataSources` section. | +| `"WorkspaceTransforms"` | Ingestion-time transforms on diagnostic settings or other non-DCR data. One per workspace. No `dataSources` or `streamDeclarations`. Streams use `Microsoft-Table-{TableName}`. | +| `"AgentSettings"` | Configure AMA agent parameters (not for data collection). | +| `"PlatformTelemetry"` | Export platform metrics. | + +See [DCR kinds guide](./dcr-kinds.md) for kind selection logic, available transformation sections, and design order per kind. + +## Column Constraints + +**Stream declaration columns (input schema):** + +| Constraint | Limit | +|---|---| +| Max column name length | 60 characters | +| Max columns per stream | 1,000 | +| Column name format | Must start with a letter. Only alphanumeric characters and underscores (`_`). | +| Reserved names | `_ResourceId`, `id`, `_SubscriptionId`, `TenantId`, `Type`, `UniqueId`, `Title` | + +**Destination table columns (output schema):** + +| Constraint | Limit | +|---|---| +| Max column name length | 45 characters | +| Max columns per table | 500 | +| `TimeGenerated` (datetime) | **Required in every LA table** (standard and custom). Transformation output must always include this column. | +| Custom columns on standard tables | Must use `_CF` suffix | + +**Critical:** The overall transformation flow (processors + `transformKql`) must ensure the final output includes `TimeGenerated` (datetime). This is a prerequisite for all Log Analytics tables. If the input lacks it, the transform must generate it (e.g., `| extend TimeGenerated = now()`). + +Stream declaration limits are more permissive than table limits. The transformation flow must map input columns to names/counts that fit within the destination table constraints. 
+ +## destinations + +```jsonc +"destinations": { + "logAnalytics": [ + { + "name": "myWorkspace", + "workspaceResourceId": "/subscriptions/.../workspaces/{name}", + "workspaceId": "guid" + } + ] +} +``` + +## dataFlows + +Ingestion-side routing and transforms. + +| Property | Type | Required | Description | +|----------|------|----------|-------------| +| `streams` | string[] | Yes | Input streams. Single stream if transform is applied | +| `destinations` | string[] | Yes | References to `destinations` entries | +| `transform` | string | No | Named transformation reference. **Mutually exclusive** with `transformKql` | +| `transformKql` | string | No | Inline KQL expression. **Mutually exclusive** with `transform` | +| `outputStream` | string | Conditional | Target table. `Microsoft-*` for standard, `Custom-*_CL` for custom. Required for non-default routing | + +- Multiple dataFlows can consume the same stream (split to different tables) +- One stream can only target one LA workspace per DCR +- A DCR can mix `transform` and `transformKql` across different dataFlows + +See [routing rules](./destination-routing.md) for stream-to-table routing patterns. + +## transformations + +Array of named transformation definitions (multi-stage only). + +| Property | Type | Required | Description | +|----------|------|----------|-------------| +| `name` | string | Yes | Unique name, referenced by dataSources/dataFlows `transform` | +| `headerProcessor` | object | Yes | Header processor establishing starting schema | +| `processors` | object[] | No | Ordered sequence of transformation processors | + +### Context Rules + +- **Client-side transforms** (referenced from dataSources): use data-source-specific headers (`header.Syslog`, `header.WindowsEvents`, etc.) 
+- **Ingestion-side transforms** (referenced from dataFlows): use `header.StandardStream` or `header.CustomStream` +- Same transformation can be reused across multiple data sources/flows if headers are compatible + +See [processors-headers.md](./processors-headers.md) and [processors-operations.md](./processors-operations.md) for processor types. See [KQL transforms](./kql-transforms.md) for `transformKql` syntax. + +For REST API usage (PUT/GET), see [put-dcr.ps1](../scripts/put-dcr.ps1) and [get-dcr.ps1](../scripts/get-dcr.ps1). API version: `2025-05-11`. diff --git a/plugin/skills/azure-data-collection-rules/references/decision-guide.md b/plugin/skills/azure-data-collection-rules/references/decision-guide.md new file mode 100644 index 000000000..141a55a93 --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/references/decision-guide.md @@ -0,0 +1,15 @@ +# DCR Design Decision Guide + +Quick scenario-to-approach mapping for common DCR configurations. + +| Scenario | Approach | +|----------|----------| +| Simple KQL transform on ingestion | Use `transformKql` in dataFlows (single-stage, no `transformations` section needed) | +| Client-side filtering/parsing | Use `transformations` section with appropriate header + processors, reference from dataSource `transform` | +| Client-side + ingestion-side | Use `transformations` for both, reference from dataSource and dataFlow respectively | +| Same logs to multiple tables | Multiple dataFlows consuming the same stream with different filters and `outputStream` (split); or identical routing to multiple tables (copy) | +| Aggregation | Route aggregated data to a custom table (schema changes entirely) | +| Mix old and new style | Allowed; some dataFlows can use `transformKql`, others can use `transform` | +| Send custom data from app/script | Use direct ingestion DCR (`kind: "Direct"`, no `dataSources`). 
See [direct ingestion](./direct-ingestion.md) | +| API ingestion with schema mapping | Direct DCR with `transformKql` to map incoming JSON to destination table schema | +| API ingestion to standard table | Direct DCR with `outputStream: "Microsoft-{Table}"` and appropriate transform | diff --git a/plugin/skills/azure-data-collection-rules/references/destination-routing.md b/plugin/skills/azure-data-collection-rules/references/destination-routing.md new file mode 100644 index 000000000..73afc0d2b --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/references/destination-routing.md @@ -0,0 +1,39 @@ +# Destination Routing Rules + +Rules governing how streams map to destination tables in Log Analytics. + +## Core Routing Rules + +| # | Rule | Example | +|---|------|---------| +| 1 | **Standard stream → its default standard table** works implicitly. `outputStream` is optional. | `Microsoft-Syslog` → Syslog table | +| 2 | **Custom stream → custom table** always works. Table name must end with `_CL`. | `Custom-MyLogs` → `Custom-MyLogs_CL` | +| 3 | **Custom stream → supported standard table** works only for tables on the [supported list](./supported-tables.md). | `Custom-MyEvents` → `Microsoft-Event` (Event is on the list) | +| 4 | **Custom stream → unsupported standard table** is **not allowed**. Route to a custom table instead (`Custom-*_CL`). In direct ingestion, only custom streams are available, so this is the only option. | Cannot send `Custom-X` to a table not on the supported list → create a custom table | +| 5 | **Standard stream → custom table** requires `outputStream` set to `Custom-{Table}_CL` and a `transformKql` (even a pass-through `"source"` works). Both `outputStream` and `transformKql` are required for this routing. | Split syslog: `Microsoft-Syslog` + `outputStream: "Custom-SyslogArchive_CL"` + `transformKql: "source"` | +| 6 | **`outputStream` is required** when routing to a non-default table. 
| Routing `Microsoft-Syslog` to `Custom-FilteredSyslog_CL` requires `outputStream` | +| 7 | **One stream → one LA workspace per DCR**. Split/copy to multiple tables in the same workspace is fine. | Multiple dataFlows with different `outputStream` but same workspace destination | +| 8 | **Custom destination tables must exist before DCR deployment.** The DCR references the table by name; if it doesn't exist, deployment or data flow will fail. | Create `Custom-AppLogs_CL` via API/portal before PUT on the DCR | +| 9 | **Destination table schema must accommodate transformation output.** If the transform produces columns not present in the destination table, those columns must be added to the table before deploying the DCR. Applies to both standard tables (use `_CF` suffix) and custom tables. | Transform adds `ParsedHost` → add `ParsedHost_CF` on standard table, or `ParsedHost` on custom table | + +**outputStream format:** Standard table default: omit or `Microsoft-{TableName}`. Non-default standard: `Microsoft-{TableName}`. Custom table: `Custom-{TableName}_CL`. + +Column constraints: see [DCR schema](./dcr-schema.md#column-constraints). + +## Transform-Derived Stream Rules + +When an agent-based data source has a `transform` that modifies the output schema (e.g., `map.Drop`, `map.Rename`, `parse.*`): + +1. The data source `streams` must use a `Custom-*` stream name (not `Microsoft-*`) +2. This custom stream is **implicitly derived** from the transform output — do NOT add it to `streamDeclarations` +3. The dataFlow routes the `Custom-*` stream to the destination using `outputStream` (e.g., `outputStream: "Microsoft-Syslog"` for standard tables on the supported list) +4. `streamDeclarations` is only for: direct ingestion custom streams, logFiles custom streams, or custom streams not derived from a transform + +## Routing Decision Logic + +1. **Custom table destination?** Use custom stream. Set `outputStream` to `Custom-{Table}_CL`. Create table before deploying. +2. 
**Standard table, schema matches exactly?** Use standard stream. `outputStream` optional. +3. **Standard table, transform adds columns?** Add columns with `_CF` suffix before deploying. +4. **Standard table, schema differs, table on [supported list](./supported-tables.md)?** Use custom stream + `outputStream: "Microsoft-{Table}"` + `transformKql` to map. +5. **Standard table, schema differs, NOT on supported list?** Use standard stream + `transformKql` to reshape, or route to custom table instead. +6. **Custom columns on standard table?** Column names must end with `_CF`. diff --git a/plugin/skills/azure-data-collection-rules/references/direct-ingestion.md b/plugin/skills/azure-data-collection-rules/references/direct-ingestion.md new file mode 100644 index 000000000..7b85f3a49 --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/references/direct-ingestion.md @@ -0,0 +1,141 @@ +# Direct Ingestion via Log Ingestion API + +## Overview + +The Log Ingestion API lets you send data directly to a Log Analytics workspace via REST API or client libraries, without an agent. The DCR for direct ingestion uses `"kind": "Direct"` and has no `dataSources` section. 
+ +## Architecture + +``` +Your App/Script → (HTTPS POST) → DCR Logs Ingestion Endpoint → DCR Transform → LA Workspace Table +``` + +## Required Components + +| Component | Purpose | +|-----------|---------| +| **Entra app registration** | Authenticate API calls (client credentials flow) | +| **Custom or standard table** | Destination in LA workspace (custom tables must end with `_CL`) | +| **DCR with `kind: "Direct"`** | Defines input schema, transform, and destination | +| **RBAC assignment** | Grant app the **Monitoring Metrics Publisher** role on the DCR | +| **DCE** (optional) | Required only if using private link; otherwise use DCR's built-in `logsIngestion` endpoint | + +## DCR Structure for Direct Ingestion + +Direct ingestion DCRs differ from agent-based DCRs: +- Must have `"kind": "Direct"` at top level +- No `dataSources` section +- `streamDeclarations` defines the shape of incoming JSON data (not the destination table) +- `transformKql` maps incoming data to the destination table schema +- DCR auto-generates a `logsIngestion` endpoint when `kind: "Direct"` is set + +```jsonc +{ + "location": "{region}", + "kind": "Direct", + "properties": { + "streamDeclarations": { + "Custom-{StreamName}": { + "columns": [ + // Schema of INCOMING data, not the destination table + ] + } + }, + "destinations": { + "logAnalytics": [ + { + "workspaceResourceId": "/subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.OperationalInsights/workspaces/{ws}", + "name": "myworkspace" + } + ] + }, + "dataFlows": [ + { + "streams": ["Custom-{StreamName}"], + "destinations": ["myworkspace"], + "transformKql": "source | ...", + "outputStream": "Custom-{TableName}_CL" + } + ] + } +} +``` + +## API Endpoint + +After DCR creation, retrieve the logs ingestion endpoint from the DCR resource: + +``` +GET /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.Insights/dataCollectionRules/{name}?api-version=2023-03-11 +``` + +The endpoint is in 
`properties.endpoints.logsIngestion`. + +## Sending Data + +### REST API Call + +``` +POST {logsIngestionEndpoint}/dataCollectionRules/{dcrImmutableId}/streams/{streamName}?api-version=2023-01-01 +Authorization: Bearer {token} +Content-Type: application/json + +[ + { + "TimeGenerated": "2026-04-27T12:00:00Z", + "Column1": "value1", + "Column2": 42 + } +] +``` + +- Body must be a JSON array +- Each object must match the stream declaration schema +- UTF-8 encoded +- Max 1 MB per call +- Supports `Content-Encoding: gzip` + +### Authentication + +Token audience (scope): `https://monitor.azure.com/.default` + +```powershell +$tokenBody = @{ + client_id = $appId + scope = "https://monitor.azure.com/.default" + client_secret = $appSecret + grant_type = "client_credentials" +} +$uri = "https://login.microsoftonline.com/$tenantId/oauth2/v2.0/token" +$bearerToken = (Invoke-RestMethod -Uri $uri -Method Post -Body $tokenBody -ContentType "application/x-www-form-urlencoded").access_token +``` + +### Client Libraries + +| Language | Package | +|----------|---------| +| .NET | `Azure.Monitor.Ingestion` | +| Python | `azure-monitor-ingestion` | +| Java | `azure-monitor-ingestion` | +| JavaScript | `@azure/monitor-ingestion` | +| Go | `azlogs` | + +## Setup Procedure + +1. **Create Entra app registration** with a client secret +2. **Create custom table** in LA workspace (if not using an existing table) +3. **Create DCR** with `kind: "Direct"`, stream declarations matching incoming data, and `transformKql` mapping to destination table +4. **Assign RBAC**: grant the app **Monitoring Metrics Publisher** role on the DCR +5. **Retrieve endpoint**: get `logsIngestion` URI and `immutableId` from the DCR +6. 
**Send data**: POST JSON array to the endpoint + +## Key Differences from Agent-Based DCRs + +| Aspect | Agent-based | Direct | +|--------|------------|--------| +| `kind` | `"Linux"` / `"Windows"` | `"Direct"` | +| `dataSources` | required | absent | +| `streamDeclarations` | raw/post-transform data | incoming API payload shape | +| `transformations` | supported (multi-stage) | not applicable (use `transformKql`) | + +For limits (payload size, rate limits), see [limits.md](./limits.md). diff --git a/plugin/skills/azure-data-collection-rules/references/kql-transforms.md b/plugin/skills/azure-data-collection-rules/references/kql-transforms.md new file mode 100644 index 000000000..8f0c9d356 --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/references/kql-transforms.md @@ -0,0 +1,234 @@ +# KQL Transform Patterns for DCR Ingestion-Time Transformations + +KQL transforms in DCRs operate on a virtual table called `source`. The output of the query defines what gets ingested. + +## Syntax Rules + +- Always start with `source` (or `let` definitions before `source`) +- Use pipe (`|`) to chain operators +- Output columns must match the destination table schema (or be a subset) +- `TimeGenerated` (datetime) must be present in the final output — required for ALL LA tables. If the input lacks it, generate it (e.g., `| extend TimeGenerated = now()`) +- `let` statements are supported. Right-hand side can be a scalar expression, tabular expression, or user-defined function. Only user-defined functions with scalar arguments are supported. 
+- `parse` operator is preferred over `extract()` with regex when format is predictable (regex generation is error-prone) +- `parse` is limited to 10 columns per statement (chain multiple `parse` statements if needed) +- No cross-resource `join`, `union`, or `externaldata` +- `summarize` / aggregation not supported (use `aggregate.Basic` processor instead) +- `bag_keys()` not supported + +## Transform Pipeline Framework + +Standard order for building ingestion-time transforms. Skip steps that don't apply. + +``` +let definitions — 0. DEFINE: lookup tables, whitelists, helper functions +source + | 1. EARLY FILTER — drop on pre-existing fields (before any compute) + | 2. PARSE/EXTRACT — get structured fields from raw data + | 3. LATE FILTER — drop on parsed field values + | 4. ENRICH — add computed/classified columns, fill defaults + | 5. NORMALIZE — typecast, rename columns + | 6. PROJECT — explicit final column set matching destination + v +destination +``` + +### Mixed event types + +**Option A: DCR-level split (preferred)** — Multiple `dataFlows` from the same stream, each with its own `transformKql` and `outputStream`. Max 10 dataFlows per DCR. + +**Option B: KQL-level conditional** — Use `case()`/`iff()` inline when event types share a destination and most columns (best for 2-4 variants): + +```kql +source +| extend EventType = tostring(parse_json(RawData).type) +| extend + UserId = iff(EventType in ("auth", "session"), tostring(parse_json(RawData).userId), ""), + ResourcePath = iff(EventType == "access", tostring(parse_json(RawData).path), "") +| project TimeGenerated, EventType, UserId, ResourcePath +``` + +Use Option A for different destination tables/schemas. Use Option B for shared schema with minor variants. 
+ +### Pipeline performance notes + +- `where` is the cheapest operator — push filtering as early as possible +- Use `_cs` (case-sensitive) variants of string operators when literal values have known casing (`startswith_cs`, `contains_cs`, `has_cs`) — avoids case-folding overhead +- `parse` is preferred over `extract()` for readability and reliability (no regex errors) +- `parse_json()` on every row is expensive; if you need only 1-2 fields from a large JSON, `parse` with string matching can be lighter +- `case()` short-circuits: put the most common branch first +- Combine related `extend` calls into one block (comma-separated) to reduce pipe overhead +- Go straight to `project` at the end; use inline renaming (`NewName = OldName`) in `project` to rename columns + +## Patterns by Pipeline Stage + +### Stage 0: Define (`let`) + +**Whitelist for filtering:** + +```kql +let allowedApps = datatable(app:string)["web-api", "auth-svc", "worker"]; +source +| where AppName in (allowedApps) +``` + +**Helper function (UDF with scalar args):** + +```kql +let classifySeverity = (level:int) { + case(level >= 6, "Critical", level >= 4, "High", level >= 2, "Medium", "Low") +}; +source +| extend Priority = classifySeverity(SeverityNumber) +``` + +### Stage 1: Early Filter + +```kql +source +| where SeverityLevel >= 4 +| where Computer !startswith_cs "test-" +``` + +**Filter using `let`-defined list:** + +```kql +let noisyApps = datatable(app:string)["healthcheck", "ping", "warmup"]; +source +| where AppName !in (noisyApps) +``` + +### Stage 2: Parse/Extract + +**Parse JSON:** + +```kql +source +| extend parsed = parse_json(RawData) +| extend + UserName = tostring(parsed.user), + Action = tostring(parsed.action), + StatusCode = toint(parsed.status) +``` + +**`parse` operator (preferred for predictable formats):** + +Use `*` to skip content before/between/after the target field. 
Within a single `parse` statement, fields must appear in the order written; use separate `parse` statements (shown further below) when attribute order can vary:
Must match destination schema for custom tables. + +```kql +| project TimeGenerated, HostName = Computer, UserName, Action, StatusCode +``` + +### Misc Patterns + +**Pass-through (no transform):** + +```kql +source +``` + +**TimeGenerated generation (when source lacks it):** + +```kql +source +| extend TimeGenerated = now() +``` + +**Full pipeline example (all stages):** + +```kql +let allowedApps = datatable(app:string)["web-api", "auth-svc", "worker"]; +source +| where Facility != "kern" +| parse SyslogMessage with * "app=" AppName:string " " * +| parse SyslogMessage with * "status=" Status:int " " * +| parse SyslogMessage with * "dur=" Duration:real +| where AppName in (allowedApps) +| extend + StatusBucket = case(Status < 400, "OK", Status < 500, "ClientErr", "ServerErr") +| project TimeGenerated, HostName = Computer, AppName, StatusCode = Status, StatusBucket, DurationMs = Duration +``` + + +## Supported Features Reference + +Full list of supported operators, functions, and statements: [Supported KQL features in Azure Monitor transformations](https://learn.microsoft.com/en-us/azure/azure-monitor/data-collection/data-collection-transformations-kql) diff --git a/plugin/skills/azure-data-collection-rules/references/la-tables.md b/plugin/skills/azure-data-collection-rules/references/la-tables.md new file mode 100644 index 000000000..86b672f6a --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/references/la-tables.md @@ -0,0 +1,76 @@ +# Log Analytics Tables Reference + +## Standard Tables + +Standard tables have predefined schemas and are referenced via `Microsoft-*` streams. Data landing in standard tables must match the table schema. 
+ +Common standard tables: + +| Table | Stream | Use | +|-------|--------|-----| +| Syslog | Microsoft-Syslog | Linux syslog | +| Event | Microsoft-Event | Windows events | +| Perf | Microsoft-Perf | Performance counters | +| InsightsMetrics | Microsoft-InsightsMetrics | Performance metrics | +| W3CIISLog | Microsoft-W3CIISLog | IIS logs | +| CommonSecurityLog | Microsoft-CommonSecurityLog | CEF security logs | + +## Custom Tables + +Custom tables have user-defined schemas and names ending in `_CL`. They must be created in the LA workspace before data can be ingested. + +### Creating a Custom Table via REST API + +``` +PUT https://management.azure.com/subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.OperationalInsights/workspaces/{workspace}/tables/{tableName}_CL?api-version=2022-10-01 +Content-Type: application/json + +{ + "properties": { + "schema": { + "name": "{tableName}_CL", + "columns": [ + { "name": "TimeGenerated", "type": "datetime" }, + { "name": "MyColumn", "type": "string" }, + { "name": "Count", "type": "int" } + ] + }, + "retentionInDays": 30, + "totalRetentionInDays": 90, + "plan": "Analytics" + } +} +``` + +### Table Plans + +| Plan | Description | Use Case | +|------|-------------|----------| +| `Analytics` | Full query, alerting, dashboards | Primary operational data | +| `Basic` | Limited query, lower cost | High-volume, infrequent access | +| `Auxiliary` | Lowest cost, limited features | Compliance, long retention | + +### Column Types + +Same as stream declarations: `string`, `int`, `long`, `real`, `boolean`, `dynamic`, `datetime` + +### Rules + +- `TimeGenerated` (datetime) is required in every custom table +- Table name must end with `_CL` +- Column names are case-sensitive +- Schema changes (adding columns) can be done via PUT with the updated schema +- Removing columns requires recreating the table +- The table schema and the custom stream declaration schema should match + +### Getting Table Schema + +```powershell +$path = 
"/subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.OperationalInsights/workspaces/{workspace}/tables/{tableName}?api-version=2022-10-01" +$response = Invoke-AzRestMethod -Path $path -Method GET +($response.Content | ConvertFrom-Json).properties.schema.columns +``` + +### PowerShell Helper + +Use [create-custom-table.ps1](../scripts/create-custom-table.ps1) and [get-table-schema.ps1](../scripts/get-table-schema.ps1). diff --git a/plugin/skills/azure-data-collection-rules/references/limits.md b/plugin/skills/azure-data-collection-rules/references/limits.md new file mode 100644 index 000000000..af98304ff --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/references/limits.md @@ -0,0 +1,73 @@ +# DCR Limits and Constraints + +Documented limits from Azure Monitor service-limits page plus related Log Analytics workspace and Logs Ingestion API limits relevant to DCR authoring. + +Source: [Azure Monitor service limits](https://learn.microsoft.com/en-us/azure/azure-monitor/service-limits) + +## DCR Structure Limits + +| Element | Limit | Source | +|---|---|---| +| Data sources per DCR | 10 | [Service limits](https://learn.microsoft.com/en-us/azure/azure-monitor/service-limits#data-collection-rules) | +| Data flows per DCR | 10 | Service limits | +| Data streams per DCR | 20 | Service limits | +| Extensions per DCR | 10 | Service limits | +| Extension settings size | 32 KB | Service limits | +| Log Analytics workspace destinations per DCR | 10 | Service limits | +| Characters in a transformation (transformKql) | 15,360 | Service limits | +| Counter specifiers per performance counter data source | 100 | Service limits | +| Facility names per Syslog data source | 20 | Service limits | +| XPath queries per Windows Event Log data source | 100 | Service limits | + +## DCR Resource-Level Limits + +| Element | Limit | Source | +|---|---|---| +| DCR resource name max length | 260 chars | ARM resource name limit | +| DCR name for kind "Direct" (used as DNS label) 
| 3-30 chars, alphanumeric + hyphens only | DCR API validation (Direct kind requires DNS-safe name) | +| Disallowed characters in DCR resource name | `<>%&:\?/` | ARM resource name validation | +| DCR associations (DCRAs) per resource | 30 | [DCR overview](https://learn.microsoft.com/en-us/azure/azure-monitor/essentials/data-collection-rule-overview) | + +## Log Analytics Workspace Limits (Destination) + +| Element | Limit | Source | +|---|---|---| +| Columns per table | 500 | [General workspace limits](https://learn.microsoft.com/en-us/azure/azure-monitor/service-limits#general-workspace-limits) | +| Column name length | 2-45 chars | General workspace limits + custom table docs | +| Custom log tables per workspace | 500 | General workspace limits | + +## Column Name Rules (Custom Tables) + +- Must start with a letter (A-Z or a-z) +- After first char: letters, digits, or underscores only +- No spaces, dots, dashes, or other punctuation +- Non-ASCII letters not supported +- Custom columns in Azure tables must end in `_CF` +- Reserved names: `id`, `BilledSize`, `IsBillable`, `InvalidTimeGenerated`, `TenantId`, `Title`, `Type`, `UniqueId`, `_ItemId`, `_ResourceGroup`, `_ResourceId`, `_SubscriptionId`, `_TimeReceived` + +Source: [Add or delete tables and columns](https://learn.microsoft.com/en-us/azure/azure-monitor/logs/create-custom-table) + +## Logs Ingestion API Limits + +| Element | Limit | Source | +|---|---|---| +| API call body size | 1 MB (compressed or uncompressed) | [Logs Ingestion API limits](https://learn.microsoft.com/en-us/azure/azure-monitor/service-limits#logs-ingestion-api) | +| Field value max size | 64 KB (truncated if exceeded) | Logs Ingestion API limits | +| Data per minute per DCR | 2 GB (soft, auto-scales) | Logs Ingestion API limits | +| Requests per minute per DCR | 12,000 (soft, auto-scales) | Logs Ingestion API limits | +| TimeGenerated range per API call (Auxiliary tables) | 30 minutes | Logs Ingestion API limits | + +## Stream 
Declaration Constraints + +- Custom stream names must begin with `Custom-` +- Supported column types: `string`, `int`, `long`, `real`, `boolean`, `dynamic`, `datetime` +- `guid` type not available in stream declarations (use `string`) +- Every table must have a `TimeGenerated` column (auto-added by transform if missing) + +## Data Flow Constraints + +- One stream can only send to one Log Analytics workspace in a single DCR +- Multiple dataFlow entries allowed for same stream if targeting different tables in the same workspace +- To send one stream to multiple workspaces, create separate DCRs + + diff --git a/plugin/skills/azure-data-collection-rules/references/procedure.md b/plugin/skills/azure-data-collection-rules/references/procedure.md new file mode 100644 index 000000000..18125e99a --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/references/procedure.md @@ -0,0 +1,101 @@ +# DCR Authoring Procedure + +Full step-by-step workflow for creating or modifying a Data Collection Rule. + +## Step 1: Gather Requirements + +Collect from the user: + +1. **New or existing DCR?** If existing, retrieve via [get-dcr.ps1](../scripts/get-dcr.ps1) and ask what modifications they want. If new, proceed. +2. **Ingestion method:** agent-based (AMA on VM) or direct ingestion (Log Ingestion API from app/script)? +3. **Data source type(s):** syslog, windowsEventLogs, performanceCounters, logFiles (JSON/text), iisLogs (agent-based), or custom JSON payload (direct) +4. **What the user wants to achieve** (in their own words): e.g., "filter noisy logs", "extract fields from JSON", "reduce volume", "drop PII", "aggregate counters". Do NOT require the user to specify processors or stages, but inform them that it is possible to assign certain processors manually if needed. +5. **Destination:** standard table (e.g., Syslog, Event) or custom table (`*_CL`) +6. **Split / copy:** does the same data need to go to multiple tables (copy), or different subsets to different tables (split)? 
+7. **Target subscription, resource group, workspace resource ID**. + +## Step 2: Determine DCR Kind + +Consult the [DCR kinds guide](./dcr-kinds.md) to select `Linux`, `Windows`, `Direct`, or `WorkspaceTransforms` based on the ingestion method from Step 1. This determines what transformation capabilities are available in Step 3. + +## Step 3: Design Transformation Pipeline + +Using the DCR kind from Step 2 and the user's intent from Step 1, follow the design order in the [DCR kinds guide](./dcr-kinds.md) to build the transformation pipeline. For agent-based processor selection, consult [processor heuristics](./processor-heuristics.md). + +Present the recommended pipeline to the user for confirmation: + +``` +Recommended pipeline: + Native filters: logLevels = ["Warning", "Error", "Critical", "Alert", "Emergency"] + Client-side: header.Syslog → filter.Basic (Message contains "failed") → map.Drop (ProcessId, HostIP) + Ingestion-side: (none needed) + Destination: Microsoft-Syslog → Syslog table +``` + +If the user's intent doesn't map cleanly to a heuristic, fall back to asking which processors they want. + +## Step 4: Design and Author the DCR + +1. **Identify streams and destination table** — consult [destination routing rules](./destination-routing.md): + - Standard stream (`Microsoft-*`) if output matches standard table schema + - Custom stream (`Custom-*`) if schema changes or routing to a custom table + - Whether the target standard table accepts custom streams (see supported table list) +2. **Define custom stream schemas** in `streamDeclarations` if needed +3. **Author data sources** with collection settings and optional client-side `transform` reference +4. **Author transformations** in the `transformations` section (multi-stage) or use `transformKql` inline (single-stage) +5. **Define data flows** routing streams to destinations with optional ingestion-side transforms +6. 
**Define destinations** with workspace resource ID + +**If split or copy is involved:** consult [Split/Copy Cost Optimization](./processor-heuristics-staging.md#splitcopy-cost-optimization). Key rules: +- All destinations Analytics: pipeline-side preferred (transforms free, single network send) +- Any destination Auxiliary: evaluate per-flow processing charges vs. network cost +- Copy + split combined: MUST use pipeline-side (client-side filtering breaks the copy path) +- Multi-workspace: AMA-only. Prefer separate DCRs per workspace. Single DCR with multiple destinations is possible but only use if the user explicitly requests it. + +**Output the final DCR JSON** per [DCR schema](./dcr-schema.md). See [example DCRs](../examples/) for reference. + +## Step 5: Validate + +Before deploying, validate: + +1. Every `transform` reference in dataSources/dataFlows points to a named transformation in `transformations` +2. Every custom stream in `streams` arrays has a matching `streamDeclarations` entry +3. Every destination referenced in `dataFlows` exists in `destinations` +4. `transform` and `transformKql` are not both present on the same data flow +5. Header processors match their context (client-side headers for data sources, ingestion-side headers for data flows) +6. Output schema of processors is compatible with the target stream/table +7. **Limits compliance** (see [limits.md](./limits.md)): + - Data sources ≤ 10, data flows ≤ 10, streams ≤ 20, LA destinations ≤ 10 + - `transformKql` ≤ 15,360 characters + - Perf counter specifiers ≤ 100, syslog facilities ≤ 20, xPathQueries ≤ 100 + - Stream column names: start with letter, alphanumeric + underscore only, ≤ 60 chars, ≤ 1,000 columns + - Column types: only `string`, `int`, `long`, `real`, `boolean`, `dynamic`, `datetime` (no `guid`) + - Direct DCR name: 3–30 chars, alphanumeric + hyphens (DNS-safe) + +Run [validate-dcr.ps1](../scripts/validate-dcr.ps1) for automated checks (includes all limits validation). 
+ +## Step 6: Deploy and Prepare Tables + +Deploy with [put-dcr.ps1](../scripts/put-dcr.ps1): + +```powershell +.\put-dcr.ps1 -SubscriptionId "{sub}" -ResourceGroupName "{rg}" -DcrName "{name}" -DcrFilePath "dcr.json" +``` + +Use [get-dcr.ps1](../scripts/get-dcr.ps1) to retrieve an existing DCR for editing. + +**Prepare destination tables** before deploying the DCR: + +1. **Custom tables:** create with [create-custom-table.ps1](../scripts/create-custom-table.ps1). See [LA tables](./la-tables.md). +2. **Standard tables with extra columns:** add them with `_CF` suffix before deploying. +3. **Schema verification:** compare transform output against table schema ([get-table-schema.ps1](../scripts/get-table-schema.ps1)). Mismatches cause silent data loss. + +See [destination routing](./destination-routing.md) (rules 8-9) for details. + +## Step 7: Verify + +After deployment: + +1. Check DCR validation status in Azure portal or via GET on the DCR resource +2. Query the destination table in Log Analytics to confirm data arrives +3. Check for ingestion errors in the `_LogOperation` table diff --git a/plugin/skills/azure-data-collection-rules/references/processor-heuristics-filters.md b/plugin/skills/azure-data-collection-rules/references/processor-heuristics-filters.md new file mode 100644 index 000000000..2bc3aceb0 --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/references/processor-heuristics-filters.md @@ -0,0 +1,38 @@ +# Processor Heuristics: Native Filters & Filtering + +Match user intent to the correct filtering approach. **Always prefer native filtering** over processors when possible. + +## Native Filter Check (apply BEFORE processor selection) + +| Data source type | Native filter parameters | What they can filter | Use `filter.Basic` only when... 
| +|---|---|---|---| +| `syslog` | `facilityNames`, `logLevels` | Severity levels, syslog facilities | Filtering on non-severity/facility fields (e.g., ProcessName, HostName, Message content) | +| `windowsEventLogs` | `xPathQueries` | Channel, EventID, Level, Provider, Keywords, any `System/*` field | Filtering on EventData content not expressible in XPath, or complex cross-field conditions spanning System and EventData | +| `performanceCounters` | `counterSpecifiers` | Which counters to collect (by object/counter/instance path) | Filtering by counter value thresholds (e.g., CPU > 80%) | +| `logFiles` | `filePatterns` | Which files to ingest (by glob pattern) | Any content-based filtering (always needed since native filter is file-level only) | +| `iisLogs` | `logDirectories` | Which directories to read | Any content-based filtering (status codes, URIs, response times, etc.) | + +**Examples of native filter vs. processor:** + +| User intent | Data source | Correct approach | Wrong approach | +|---|---|---|---| +| "Only keep Warning and above syslog" | `syslog` | Set `logLevels: ["Warning", "Error", "Critical", "Alert", "Emergency"]` | `filter.Basic` on SeverityNumber | +| "Only collect Security event IDs 4624, 4625" | `windowsEventLogs` | `xPathQueries: ["Security!*[System[(EventID=4624 or EventID=4625)]]"]` | `filter.Basic` on EventNumber | +| "Only System events Level 1-3" | `windowsEventLogs` | `xPathQueries: ["System!*[System[(Level=1 or Level=2 or Level=3)]]"]` | `filter.Basic` on EventLevel | +| "Only collect CPU and memory counters" | `performanceCounters` | `counterSpecifiers: ["\\Processor(_Total)\\% Processor Time", "\\Memory\\Available MBytes"]` | `filter.Basic` on CounterName | +| "Syslog from auth facility where message contains 'failed'" | `syslog` | `facilityNames: ["auth"]` (native) + `filter.Basic` on Message contains "failed" (processor) | Either all-native (can't filter message content) or all-processor (wasteful) | + +## Filtering 
Intent-to-Processor Map + +> **Pre-check:** Before recommending `filter.Basic`, verify the intent cannot be satisfied by native data source filters above. + +| User says / scenario | Recommended approach | Stage | Rationale | +|---|---|---|---| +| "only keep errors/warnings" (syslog) | **Native:** set `logLevels` | N/A | No processor needed | +| "only certain event IDs" (Windows) | **Native:** set `xPathQueries` with EventID filter | N/A | No processor needed | +| "only certain event levels" (Windows) | **Native:** set `xPathQueries` with Level filter | N/A | No processor needed | +| "only collect specific counters" | **Native:** set `counterSpecifiers` | N/A | No processor needed | +| "reduce volume", "drop noisy logs" (on fields not covered by native filters) | `filter.Basic` | Client-side only | Filter early to save network and ingestion cost | +| "filter after enrichment", "filter by resolved hostname" | `transform.KQL` with `where` | Ingestion-side | `filter.Basic` is client-side only; use KQL for ingestion-side filtering | +| "filter by message content", "keep logs matching pattern" | `filter.Basic` (if `contains`/`==` suffices) or `transform.KQL` (if regex needed) | Client-side / Ingestion-side respectively | `filter.Basic` is client-side only; `transform.KQL` is ingestion-side only | +| "complex filter with string functions" | `transform.KQL` | Ingestion-side | Only `transform.KQL` is available in the pipeline | diff --git a/plugin/skills/azure-data-collection-rules/references/processor-heuristics-staging.md b/plugin/skills/azure-data-collection-rules/references/processor-heuristics-staging.md new file mode 100644 index 000000000..b6a246aeb --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/references/processor-heuristics-staging.md @@ -0,0 +1,71 @@ +# Processor Heuristics: Staging & Cost Optimization + +Stage placement rules, split/copy billing optimization, and common multi-processor chains. 
+ +## Stage Decision Heuristics + +When the user doesn't specify a stage, apply these rules in order: + +1. **Security-sensitive data (PII, secrets)?** → Client-side `map.Drop` or `filter.Basic` (data never leaves the VM) +2. **Split/copy scenario?** → Consult "Split/Copy Cost Optimization" below. Do NOT filter client-side if a copy path exists from the same data source. +3. **Volume reduction possible early?** → Client-side (saves network, ingestion, and storage cost) +4. **Requires KQL string functions, regex, or computed columns?** → Ingestion-side `transform.KQL` (the only non-header processor available in the pipeline) +5. **Depends on standard table enrichment (e.g., _ResourceId)?** → Ingestion-side `transform.KQL` +6. **All non-KQL processors are client-side only.** Only `transform.KQL` (+ `header.StandardStream`/`header.CustomStream`) runs in the pipeline. +7. **Need to combine with existing `transformKql`?** → Keep in ingestion-side for consistency + +## Split/Copy Cost Optimization + +**Key billing facts:** +- **Analytics/Basic destinations:** Transformations are free (no processing charge regardless of how much data is filtered or added). +- **Auxiliary destinations:** Processing charge on ALL incoming data to the flow (full input volume), regardless of how much is dropped. Plus ingestion charge on output volume. +- **Client-side split (multiple data sources):** Each data source sends its subset independently. Same data may be sent multiple times in overlapping/copy scenarios. +- **Pipeline-side split (multiple dataFlows):** Each flow processes the full input stream. For N flows, processing charges are incurred N times on the input volume (relevant for Auxiliary). +- **Multi-workspace:** Only possible via AMA double-ingestion (separate data sources targeting different workspaces). Not available for direct ingestion DCRs. 
+ +**Optimal strategy by scenario:** + +| Scenario | Optimal approach | Rationale | +|---|---|---| +| Copy → multiple Analytics tables | Pipeline-side (multiple dataFlows, same stream) | Transformations free. Single network send. | +| Copy → Analytics + Auxiliary | Pipeline-side | Auxiliary processing charge unavoidable. Single send saves network vs. double-ingestion. | +| Split → multiple Analytics tables | Pipeline-side | Transformations free regardless of drop ratio. Single network send. | +| Split → multiple Auxiliary tables | Client-side (separate data sources with native filters) | Each source sends only its subset, reducing per-flow processing volume. If native filters can't separate cleanly, pipeline-side is the only option (accept multiplied processing charges). | +| Copy + Split combined (full copy → Auxiliary, filtered subset → Analytics) | Pipeline-side only | Cannot filter client-side (would break the copy). Accept Auxiliary processing charge on full volume. Analytics flow is free. | +| Multi-workspace | AMA double-ingestion (separate data sources) | Only possible in AMA DCRs. Each workspace requires its own data source entry. | + +**Critical rule:** When a copy path and a split path share the same data source, do NOT apply client-side filtering. Client-side filters reduce data before it reaches ALL streams, breaking the copy. Place split filters on the ingestion-side dataFlow only. + +## Common Multi-Processor Chains + +### Syslog: filter by message content + drop columns +``` +Native: logLevels = ["Warning", "Error", "Critical", "Alert", "Emergency"] (severity filtering) +Client-side: header.Syslog → filter.Basic (Message contains "authentication") → map.Drop (ProcessId, HostIP) +``` +Stage: client-side. Rationale: native `logLevels` handles severity; processor handles content filter and column drop. 
+ +### Windows Events: parse XML + extract fields + drop raw +``` +header.WindowsEvents → parse.XmlPath (extract EventID, UserName from RawXml) → map.Drop (remove RawXml, RenderingInfo) +``` +Stage: client-side. Rationale: extract needed fields, drop large raw XML. + +### Text logs: parse JSON + rename + ingestion KQL +``` +Client: header.TextLog → parse.JsonPath (extract structured fields from RawData) → map.Drop (RawData, FilePath) +Ingestion: header.CustomStream → transform.KQL (extend computed columns, filter) +``` +Two-stage chain. Rationale: parse on VM, compute on ingestion. + +### Performance counters: aggregate +``` +header.WindowsPerformanceCounters → aggregate.Basic (5m window, avg/max/count by Host, CounterName) +``` +Stage: client-side. Rationale: dramatic volume reduction. Must route to custom table. + +### CEF security logs: parse + enrich +``` +header.Syslog → parse.CEFAttribute (extract deviceAction, sourceAddress, destinationAddress) → enrich.DNSLookup (resolve sourceAddress) +``` +Stage: client-side. Rationale: extract and enrich before send. diff --git a/plugin/skills/azure-data-collection-rules/references/processor-heuristics-transforms.md b/plugin/skills/azure-data-collection-rules/references/processor-heuristics-transforms.md new file mode 100644 index 000000000..6fff8c405 --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/references/processor-heuristics-transforms.md @@ -0,0 +1,66 @@ +# Processor Heuristics: Transforms & Intent Mapping + +Intent-to-processor mapping for parsing, schema modification, aggregation, enrichment, and data routing. 
+ +## Parsing / Field Extraction + +| User says / scenario | Data signal | Recommended processor | Stage | +|---|---|---|---| +| "extract fields from JSON", "parse JSON" | Column contains `{...}` JSON | `parse.JsonPath` | Client-side only | +| "extract from XML", "parse event XML" | Column contains `<...>` XML (e.g., `RawXml` in Windows events) | `parse.XmlPath` | Client-side only | +| "parse CEF", "extract CEF attributes" | Syslog message in CEF format | `parse.CEFAttribute` | Client-side only | +| "extract with regex", "parse custom format" | Unstructured text, no standard format | `transform.KQL` with `extract()` | Ingestion-side only | +| "parse key-value pairs" (JSON object) | Column contains `{"k":"v", ...}` | `parse.JsonPath` (agent, client-side only) or `transform.KQL` with `parse_json()` (ingestion-side) | Client-side for agent-based; ingestion-side for direct ingestion | +| "parse key-value pairs" (delimited string) | Column contains `k=v;k=v` or `k:v, k:v` or similar | `transform.KQL` with `extract()` or `parse` (max 5 vars) | Ingestion-side only. 
See [KQL KV extraction patterns](./kql-transforms.md#extract-value-from-delimited-key-value-string) | + +## Schema Modification + +| User says / scenario | Recommended processor | Stage | Notes | +|---|---|---|---| +| "rename column", "change column name" | `map.Rename` | Client-side only | +| "change column type", "cast to int" | `map.Rename` (with `typeAs`) | Client-side only | +| "drop columns", "remove PII columns", "strip sensitive fields" | `map.Drop` | Client-side only (data never leaves VM) | +| "add computed column", "calculate new field" | `transform.KQL` with `extend` | Ingestion-side only | +| "keep only specific columns" | `transform.KQL` with `project` | Ingestion-side only; or `map.Drop` (client-side only) for exclusion-based approach | + +## Aggregation / Summarization + +| User says / scenario | Recommended processor | Stage | Notes | +|---|---|---|---| +| "aggregate", "summarize", "count events per host" | `aggregate.Basic` | Client-side only (dramatic volume reduction) | +| "average over time window", "min/max per counter" | `aggregate.Basic` | Client-side only | +| "aggregate after enrichment" | Not supported | N/A | `aggregate.Basic` is client-side only; if aggregation depends on enriched fields only available ingestion-side, aggregation is not possible | + +**Always** route aggregated output to a custom table (schema changes entirely). 
+ +## Enrichment + +| User says / scenario | Recommended processor | Stage | Notes | +|---|---|---|---| +| "resolve IP to hostname", "DNS lookup" | `enrich.DNSLookup` | Client-side only (has network access to DNS) | +| "add environment tag", "classify by hostname pattern" | `transform.KQL` with `case`/`extend` | Ingestion-side only | +| "look up geo-location" | Not available as processor; use `transform.KQL` if geo functions supported | Ingestion-side | + +## Data Routing / Split / Copy + +| User says / scenario | Recommended approach | Notes | +|---|---|---| +| "send to multiple tables" | Multiple `dataFlows` with different `outputStream` | No processor needed; DCR structure handles this | +| "split by severity" | Multiple `dataFlows` with different `transformKql` filters | One flow per destination table | +| "different transforms per destination" | Multiple `dataFlows` consuming same stream | Each flow can have its own transform | +| "send to multiple workspaces" | Separate data sources in AMA DCR (double-ingestion) | Not possible in pipeline or direct DCRs | + +## Data Source to Header Mapping + +| Data source type | Header processor | Notes | +|---|---|---| +| `syslog` | `header.Syslog` | | +| `windowsEventLogs` | `header.WindowsEvents` | | +| `performanceCounters` (Windows) | `header.WindowsPerformanceCounters` | Cannot mix with Linux in same DCR | +| `performanceCounters` (Linux) | `header.LinuxPerformanceCounters` | Cannot mix with Windows in same DCR | +| `logFiles` (text) | `header.TextLog` | | +| `logFiles` (json) | `header.TextLog` | JSON format still uses TextLog header | +| `iisLogs` | `header.IISLog` | | +| Windows Firewall | `header.WindowsFirewallLog` | | +| Ingestion-side (standard stream) | `header.StandardStream` | Requires `streamId` config | +| Ingestion-side (custom stream) | `header.CustomStream` | Requires `streamId` config | diff --git a/plugin/skills/azure-data-collection-rules/references/processors-headers.md 
b/plugin/skills/azure-data-collection-rules/references/processors-headers.md new file mode 100644 index 000000000..bc4ab302c --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/references/processors-headers.md @@ -0,0 +1,92 @@ +# Processor Reference: Headers & Stage Availability + +Processors are the building blocks of multi-stage transformations. Each processor has a `processor` name (format: `family.Name`) and a `configuration` object. + +```jsonc +{ + "processor": "family.Name", + "configuration": { } +} +``` + +## Stage Availability + +| Processor | Client-side | Ingestion-side | +|-----------|:-----------:|:--------------:| +| header.Syslog | Yes | No | +| header.WindowsEvents | Yes | No | +| header.WindowsPerformanceCounters | Yes | No | +| header.LinuxPerformanceCounters | Yes | No | +| header.TextLog | Yes | No | +| header.IISLog | Yes | No | +| header.WindowsFirewallLog | Yes | No | +| header.StandardStream | No | Yes | +| header.CustomStream | No | Yes | +| filter.Basic | Yes | No | +| map.Rename | Yes | No | +| map.Drop | Yes | No | +| parse.JsonPath | Yes | No | +| parse.XmlPath | Yes | No | +| parse.CEFAttribute | Yes | No | +| aggregate.Basic | Yes | No | +| enrich.DNSLookup | Yes | No | +| transform.KQL | No | Yes | + +## Header Processors + +Must be the first processor in every transformation. Converts raw data into a schematized tabular format. + +Header processors that take no configuration (all except `header.StandardStream` and `header.CustomStream`) should omit the `configuration` property entirely. Do not include `"configuration": {}`. + +### header.Syslog +For syslog data sources. No configuration needed — omit `configuration`. 
+ +Output columns: `TimeGenerated` (datetime), `Facility` (string), `SeverityNumber` (int), `EventTime` (datetime), `HostIP` (string), `Message` (string), `ProcessId` (string), `Severity` (string), `Host` (string), `ident` (string), `Timestamp` (datetime) + +### header.WindowsEvents +For Windows event log data sources. No configuration needed — omit `configuration`. + +Output columns: `TimeGenerated` (datetime), `TimeCreated` (datetime), `PublisherId` (string), `PublisherName` (string), `Channel` (string), `LoggingComputer` (string), `EventNumber` (int), `EventCategory` (int), `EventLevel` (string), `UserName` (string), `RawXml` (string), `EventDescription` (string), `RenderingInfo` (string), `EventRecordId` (int) + +### header.WindowsPerformanceCounters +For Windows perf counter data sources. No configuration needed — omit `configuration`. + +Output columns: `TimeGenerated` (datetime), `CounterName` (string), `CounterValue` (real), `SampleRate` (int), `Counter` (string), `Instance` (string) + +### header.LinuxPerformanceCounters +For Linux perf counter data sources. No configuration needed — omit `configuration`. + +Output columns: `TimeGenerated` (datetime), `Timestamp` (datetime), `CounterName` (string), `ObjectName` (string), `InstanceName` (string), `Value` (int), `Host` (string) + +### header.TextLog +For custom text log files. No configuration needed — omit `configuration`. + +Output columns: `TimeGenerated` (datetime), `FilePath` (string), `RawData` (string), `Computer` (string) + +### header.IISLog +For IIS log data sources. No configuration needed — omit `configuration`. + +Output columns: `TimeGenerated`, `s_sitename`, `s_computername`, `s_ip`, `cs_method`, `cs_uri_stem`, `cs_uri_query`, `s_port`, `cs_username`, `c_ip`, `cs_version`, `cs_User_Agent_`, `cs_Cookie_`, `cs_Referer_`, `cs_host`, `sc_status`, `sc_substatus`, `sc_win32_status`, `sc_bytes`, `cs_bytes`, `time_taken` + +### header.WindowsFirewallLog +For Windows Firewall logs. 
No configuration needed — omit `configuration`. + +Output columns: `TimeGenerated`, `date`, `time`, `action`, `protocol`, `src_ip`, `dst_ip`, `src_port`, `dst_port`, `size`, `tcpflags`, `tcpsyn`, `tcpack`, `tcpwin`, `icmptype`, `icmpcode`, `info`, `path`, `pid` + +### header.StandardStream +For ingestion-side transforms on standard streams. + +```jsonc +{ "streamId": "Microsoft-Syslog" } +``` + +Output schema matches the standard LA table. + +### header.CustomStream +For ingestion-side transforms on custom streams. + +```jsonc +{ "streamId": "Custom-MyStream" } +``` + +Output schema matches the `streamDeclarations` definition. diff --git a/plugin/skills/azure-data-collection-rules/references/processors-operations.md b/plugin/skills/azure-data-collection-rules/references/processors-operations.md new file mode 100644 index 000000000..912ae2b9f --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/references/processors-operations.md @@ -0,0 +1,124 @@ +# Processor Reference: Filter, Map, Parse, Aggregate, Enrich, KQL + +## filter.Basic + +Keeps records that match the conditions and drops all others. Structure: OR groups of AND groups. + +```jsonc +{ + "any": [ + { + "all": [ + { "columnName": "Facility", "operator": "==", "value": "auth" } + ] + } + ] +} +``` + +Record is **kept** if any AND group evaluates to true. + +**Operators:** +- String: `==`, `!=`, `contains`, `!contains` +- Numeric: `==`, `!=`, `>`, `<`, `>=`, `<=` + +Output: same schema, fewer records. + +## map.Rename + +```jsonc +{ + "all": [ + { "columnName": "OldName", "nameAs": "NewName", "typeAs": "string" } + ] +} +``` + +Nominally `nameAs` and `typeAs` are each optional (at least one required); in practice the API requires `nameAs` — see Limitations below. Types: `string`, `int`, `long`, `real`, `bool`, `datetime`. Failed casts produce `null`. 
+ +**Limitations:** +- `nameAs` is required by the API even for type-only casts — set `nameAs` to a new column name +- Cannot rename a column to its own name (API rejects duplicate column names) +- For in-place type casts without renaming, use `transform.KQL` instead: `source | extend Col = toint(Col)` + +## map.Drop + +```jsonc +{ "columnNames": ["Column1", "Column2"] } +``` + +## parse.JsonPath + +```jsonc +{ + "columnName": "EventData", + "all": [ + { "path": "$.user.name", "nameAs": "UserName", "typeAs": "string" } + ] +} +``` + +## parse.XmlPath + +```jsonc +{ + "columnName": "RawXml", + "all": [ + { "path": "/Event/System/EventID", "nameAs": "EventID", "typeAs": "int" } + ] +} +``` + +Supports simple XPath including attribute selectors: `/Event/EventData/Data[@Name='SubjectUserName']` + +## parse.CEFAttribute + +```jsonc +{ + "columnName": "Message", + "all": [ + { "path": "deviceAction", "nameAs": "Action", "typeAs": "string" } + ] +} +``` + +All parse processors add new columns to the schema. `typeAs` is optional; failed casts produce `null`. + +## aggregate.Basic + +```jsonc +{ + "batchingSettings": { + "timeWindow": "5m", + "maxBatchRows": 1000 + }, + "aggregates": [ + { "columnName": "CounterValue", "operator": "avg", "nameAs": "AvgValue" }, + { "operator": "count", "nameAs": "RecordCount" } + ], + "dimensionColumns": ["Host", "CounterName"] +} +``` + +**Operators:** `sum`, `avg`, `min`, `max`, `count` (`columnName` not required for `count`). +**dimensionColumns:** group-by columns (string type only). + +Output schema contains ONLY aggregate columns + dimension columns. Route aggregated data to a custom table. + +## enrich.DNSLookup + +```jsonc +{ "columnName": "IPAddress", "nameAs": "DNSName" } +``` + +Best-effort DNS resolution. Returns `null` on lookup failure. + +## transform.KQL + +Ingestion-side only. Arbitrary KQL expression. 
+ +```jsonc +{ "expression": "source | where SeverityNumber >= 4 | extend EnrichedMsg = strcat(Host, ': ', Message)" } +``` + +Output schema determined by the KQL expression. Limited static validation. diff --git a/plugin/skills/azure-data-collection-rules/references/stream-declarations.md b/plugin/skills/azure-data-collection-rules/references/stream-declarations.md new file mode 100644 index 000000000..c0c4e37d2 --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/references/stream-declarations.md @@ -0,0 +1,21 @@ +# Stream Declarations + +Defines schemas for custom streams in `streamDeclarations`. Keys must begin with `Custom-`. +Standard streams (`Microsoft-*`) have implicit schemas and need no declaration. + +**Critical: Transform-derived streams.** When an agent-based data source (syslog, windowsEventLogs, etc.) has a `transform` reference and the transform modifies the schema (e.g., via `map.Drop`, `map.Rename`, `parse.*`), the output stream must be `Custom-*`. However, this custom stream's schema is **implicitly derived** from the transform's processor chain output. Do NOT declare it in `streamDeclarations`. Only `logFiles` data sources may declare their custom streams here. For direct ingestion (`kind: "Direct"`), custom streams MUST be declared in `streamDeclarations`. + +```jsonc +"streamDeclarations": { + "Custom-MyStream": { + "columns": [ + { "name": "TimeGenerated", "type": "datetime" }, + { "name": "MyColumn", "type": "string" } + ] + } +} +``` + +**Column types:** `string`, `int`, `long`, `real`, `boolean`, `dynamic`, `datetime` + +Column constraints (name length, max fields, reserved names) are in [DCR schema](./dcr-schema.md#column-constraints). 
diff --git a/plugin/skills/azure-data-collection-rules/references/supported-tables.json b/plugin/skills/azure-data-collection-rules/references/supported-tables.json new file mode 100644 index 000000000..753132a49 --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/references/supported-tables.json @@ -0,0 +1,119 @@ +[ + "ABAPAuditLog", + "ABAPAuthorizationDetails", + "ABAPChangeDocsLog", + "ABAPUserDetails", + "ADAssessmentRecommendation", + "ADSecurityAssessmentRecommendation", + "Anomalies", + "ASimAuditEventLogs", + "ASimAuthenticationEventLogs", + "ASimDhcpEventLogs", + "ASimDnsActivityLogs", + "ASimFileEventLogs", + "ASimNetworkSessionLogs", + "ASimProcessEventLogs", + "ASimRegistryEventLogs", + "ASimUserManagementActivityLogs", + "ASimWebSessionLogs", + "AWSALBAccessLogs", + "AWSCloudTrail", + "AWSCloudWatch", + "AWSEKS", + "AWSELBFlowLogs", + "AWSGuardDuty", + "AWSNetworkFirewallAlert", + "AWSNetworkFirewallFlow", + "AWSNetworkFirewallTls", + "AWSNLBAccessLogs", + "AWSRoute53Resolver", + "AWSS3ServerAccess", + "AWSSecurityHubFindings", + "AWSVPCFlow", + "AWSWAF", + "AzureAssessmentRecommendation", + "AzureMetricsV2", + "CommonSecurityLog", + "CrowdStrikeAlerts", + "CrowdStrikeAPIActivityAudit", + "CrowdStrikeAuthActivityAudit", + "CrowdStrikeCases", + "CrowdStrikeCSPMIOAStreaming", + "CrowdStrikeCSPMSearchStreaming", + "CrowdStrikeCustomerIOC", + "CrowdStrikeDetections", + "CrowdStrikeHosts", + "CrowdStrikeIncidents", + "CrowdStrikeReconNotificationSummary", + "CrowdStrikeRemoteResponseSessionEnd", + "CrowdStrikeRemoteResponseSessionStart", + "CrowdStrikeScheduledReportNotification", + "CrowdStrikeUserActivityAudit", + "CrowdStrikeVulnerabilities", + "DeviceTvmSecureConfigurationAssessmentKB", + "DeviceTvmSoftwareVulnerabilitiesKB", + "DnsAuditEvents", + "Event", + "ExchangeAssessmentRecommendation", + "ExchangeOnlineAssessmentRecommendation", + "GCPApigee", + "GCPAuditLogs", + "GCPCDN", + "GCPCloudRun", + "GCPCloudSQL", + "GCPComputeEngine", + 
"GCPDNS", + "GCPFirewallLogs", + "GCPIAM", + "GCPIDS", + "GCPMonitoring", + "GCPNAT", + "GCPNATAudit", + "GCPResourceManager", + "GCPVPCFlow", + "GKEAPIServer", + "GKEApplication", + "GKEAudit", + "GKEControllerManager", + "GKEHPADecision", + "GKEScheduler", + "GoogleCloudSCC", + "GoogleWorkspaceReports", + "IlumioInsights", + "OTelLogs", + "QualysKnowledgeBase", + "Rapid7InsightVMCloudAssets", + "Rapid7InsightVMCloudVulnerabilities", + "SCCMAssessmentRecommendation", + "SCOMAssessmentRecommendation", + "SecurityEvent", + "SentinelAlibabaCloudAPIGatewayLogs", + "SentinelAlibabaCloudVPCFlowLogs", + "SentinelAlibabaCloudWAFLogs", + "SentinelTheHiveData", + "SfBAssessmentRecommendation", + "SfBOnlineAssessmentRecommendation", + "SharePointOnlineAssessmentRecommendation", + "SPAssessmentRecommendation", + "SQLAssessmentRecommendation", + "StorageInsightsAccountPropertiesDaily", + "StorageInsightsDailyMetrics", + "StorageInsightsHourlyMetrics", + "StorageInsightsMonthlyMetrics", + "StorageInsightsWeeklyMetrics", + "Syslog", + "ThreatIntelIndicators", + "ThreatIntelligenceIndicator", + "ThreatIntelObjects", + "UCClient", + "UCClientReadinessStatus", + "UCClientUpdateStatus", + "UCDeviceAlert", + "UCDOAggregatedStatus", + "UCDOStatus", + "UCServiceUpdateStatus", + "UCUpdateAlert", + "WindowsClientAssessmentRecommendation", + "WindowsEvent", + "WindowsServerAssessmentRecommendation" +] diff --git a/plugin/skills/azure-data-collection-rules/references/supported-tables.md b/plugin/skills/azure-data-collection-rules/references/supported-tables.md new file mode 100644 index 000000000..59f900d63 --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/references/supported-tables.md @@ -0,0 +1,146 @@ +# Standard Tables Accepting Custom Streams + +These standard tables accept data from custom streams via the Log Ingestion API or custom stream routing. Tables not on this list require standard streams. + +> **Note:** This list may change. 
Check the [current documentation](https://learn.microsoft.com/en-us/azure/azure-monitor/logs/logs-ingestion-api-overview#supported-tables) for updates. + +### Assessment & Recommendations +- ADAssessmentRecommendation +- ADSecurityAssessmentRecommendation +- AzureAssessmentRecommendation +- ExchangeAssessmentRecommendation +- ExchangeOnlineAssessmentRecommendation +- SCCMAssessmentRecommendation +- SCOMAssessmentRecommendation +- SfBAssessmentRecommendation +- SfBOnlineAssessmentRecommendation +- SharePointOnlineAssessmentRecommendation +- SPAssessmentRecommendation +- SQLAssessmentRecommendation +- WindowsClientAssessmentRecommendation +- WindowsServerAssessmentRecommendation + +### Core Monitoring +- AzureMetricsV2 +- CommonSecurityLog +- Event +- OTelLogs +- SecurityEvent +- Syslog +- WindowsEvent + +### ASIM (Normalized Security) +- ASimAuditEventLogs +- ASimAuthenticationEventLogs +- ASimDhcpEventLogs +- ASimDnsActivityLogs +- ASimFileEventLogs +- ASimNetworkSessionLogs +- ASimProcessEventLogs +- ASimRegistryEventLogs +- ASimUserManagementActivityLogs +- ASimWebSessionLogs + +### SAP (ABAP) +- ABAPAuditLog +- ABAPAuthorizationDetails +- ABAPChangeDocsLog +- ABAPUserDetails + +### AWS +- AWSALBAccessLogs +- AWSCloudTrail +- AWSCloudWatch +- AWSEKS +- AWSELBFlowLogs +- AWSGuardDuty +- AWSNetworkFirewallAlert +- AWSNetworkFirewallFlow +- AWSNetworkFirewallTls +- AWSNLBAccessLogs +- AWSRoute53Resolver +- AWSS3ServerAccess +- AWSSecurityHubFindings +- AWSVPCFlow +- AWSWAF + +### GCP +- GCPApigee +- GCPAuditLogs +- GCPCDN +- GCPCloudRun +- GCPCloudSQL +- GCPComputeEngine +- GCPDNS +- GCPFirewallLogs +- GCPIAM +- GCPIDS +- GCPMonitoring +- GCPNAT +- GCPNATAudit +- GCPResourceManager +- GCPVPCFlow +- GKEAPIServer +- GKEApplication +- GKEAudit +- GKEControllerManager +- GKEHPADecision +- GKEScheduler +- GoogleCloudSCC +- GoogleWorkspaceReports + +### Third-Party Security +- Anomalies +- CrowdStrikeAlerts +- CrowdStrikeAPIActivityAudit +- CrowdStrikeAuthActivityAudit +- 
CrowdStrikeCases +- CrowdStrikeCSPMIOAStreaming +- CrowdStrikeCSPMSearchStreaming +- CrowdStrikeCustomerIOC +- CrowdStrikeDetections +- CrowdStrikeHosts +- CrowdStrikeIncidents +- CrowdStrikeReconNotificationSummary +- CrowdStrikeRemoteResponseSessionEnd +- CrowdStrikeRemoteResponseSessionStart +- CrowdStrikeScheduledReportNotification +- CrowdStrikeUserActivityAudit +- CrowdStrikeVulnerabilities +- IlumioInsights +- QualysKnowledgeBase +- Rapid7InsightVMCloudAssets +- Rapid7InsightVMCloudVulnerabilities +- SentinelAlibabaCloudAPIGatewayLogs +- SentinelAlibabaCloudVPCFlowLogs +- SentinelAlibabaCloudWAFLogs +- SentinelTheHiveData + +### Threat Intelligence +- ThreatIntelIndicators +- ThreatIntelligenceIndicator +- ThreatIntelObjects + +### DNS +- DnsAuditEvents + +### Vulnerability Management +- DeviceTvmSecureConfigurationAssessmentKB +- DeviceTvmSoftwareVulnerabilitiesKB + +### Windows Update +- UCClient +- UCClientReadinessStatus +- UCClientUpdateStatus +- UCDeviceAlert +- UCDOAggregatedStatus +- UCDOStatus +- UCServiceUpdateStatus +- UCUpdateAlert + +### Storage Insights +- StorageInsightsAccountPropertiesDaily +- StorageInsightsDailyMetrics +- StorageInsightsHourlyMetrics +- StorageInsightsMonthlyMetrics +- StorageInsightsWeeklyMetrics diff --git a/plugin/skills/azure-data-collection-rules/scripts/create-custom-table.ps1 b/plugin/skills/azure-data-collection-rules/scripts/create-custom-table.ps1 new file mode 100644 index 000000000..6669cdfc0 --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/scripts/create-custom-table.ps1 @@ -0,0 +1,83 @@ +<# +.SYNOPSIS + Creates or updates a custom Log Analytics table. +.PARAMETER SubscriptionId + Azure subscription ID. +.PARAMETER ResourceGroupName + Resource group containing the workspace. +.PARAMETER WorkspaceName + Log Analytics workspace name. +.PARAMETER TableName + Table name (must end with _CL). +.PARAMETER SchemaFilePath + Path to a JSON file defining the table schema. 
+.PARAMETER RetentionInDays + Data retention in days. Default: 30. +.PARAMETER TotalRetentionInDays + Total retention including archive. Default: 90. +.PARAMETER Plan + Table plan: Analytics, Basic, or Auxiliary. Default: Analytics. +.PARAMETER ApiVersion + API version. Defaults to 2022-10-01. +.EXAMPLE + .\create-custom-table.ps1 -SubscriptionId "xxx" -ResourceGroupName "my-rg" -WorkspaceName "my-ws" -TableName "MyLogs_CL" -SchemaFilePath "table-schema.json" + + Schema file format: + { + "columns": [ + { "name": "TimeGenerated", "type": "datetime" }, + { "name": "Computer", "type": "string" }, + { "name": "Message", "type": "string" } + ] + } +#> +param( + [Parameter(Mandatory)][string]$SubscriptionId, + [Parameter(Mandatory)][string]$ResourceGroupName, + [Parameter(Mandatory)][string]$WorkspaceName, + [Parameter(Mandatory)][string]$TableName, + [Parameter(Mandatory)][string]$SchemaFilePath, + [int]$RetentionInDays = 30, + [int]$TotalRetentionInDays = 90, + [ValidateSet("Analytics", "Basic", "Auxiliary")][string]$Plan = "Analytics", + [string]$ApiVersion = "2022-10-01" +) + +if (-not $TableName.EndsWith("_CL")) { + Write-Error "Custom table name must end with '_CL'" + exit 1 +} + +if (-not (Test-Path $SchemaFilePath)) { + Write-Error "Schema file not found: $SchemaFilePath" + exit 1 +} + +# Parse the schema file; fail fast on malformed JSON so the TimeGenerated check below does not report a misleading error +try { + $schema = Get-Content -Path $SchemaFilePath -Raw | ConvertFrom-Json -ErrorAction Stop +} catch { + Write-Error "Invalid JSON in $SchemaFilePath : $_" + exit 1 +} + +# Verify the schema declares a TimeGenerated column of type datetime +$hasTimeGenerated = $schema.columns | Where-Object { $_.name -eq "TimeGenerated" -and $_.type -eq "datetime" } +if (-not $hasTimeGenerated) { + Write-Error "Schema must include a 'TimeGenerated' column of type 'datetime'" + exit 1 +} + +$body = @{ + properties = @{ + schema = @{ + name = $TableName + columns = $schema.columns + } + retentionInDays = $RetentionInDays + totalRetentionInDays = $TotalRetentionInDays + plan = $Plan + } +} | ConvertTo-Json -Depth 10 + +$path = 
"/subscriptions/$SubscriptionId/resourceGroups/$ResourceGroupName/providers/Microsoft.OperationalInsights/workspaces/$WorkspaceName/tables/$TableName`?api-version=$ApiVersion" +$response = Invoke-AzRestMethod -Path $path -Method PUT -Payload $body + +if ($response.StatusCode -in 200, 202) { + Write-Host "Table '$TableName' created/updated successfully." +} else { + Write-Error "Failed to create/update table '$TableName'. Status: $($response.StatusCode). Content: $($response.Content)" + exit 1 +} diff --git a/plugin/skills/azure-data-collection-rules/scripts/get-dcr.ps1 b/plugin/skills/azure-data-collection-rules/scripts/get-dcr.ps1 new file mode 100644 index 000000000..f4c5ad85d --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/scripts/get-dcr.ps1 @@ -0,0 +1,41 @@ +<# +.SYNOPSIS + Retrieves an existing Data Collection Rule via Azure REST API. +.PARAMETER SubscriptionId + Azure subscription ID. +.PARAMETER ResourceGroupName + Resource group containing the DCR. +.PARAMETER DcrName + Name of the Data Collection Rule. +.PARAMETER OutputPath + Optional. File path to save the DCR JSON. If omitted, outputs to console. +.PARAMETER ApiVersion + API version. Defaults to 2025-05-11 (multi-stage support). +.EXAMPLE + .\get-dcr.ps1 -SubscriptionId "xxx" -ResourceGroupName "my-rg" -DcrName "my-dcr" + .\get-dcr.ps1 -SubscriptionId "xxx" -ResourceGroupName "my-rg" -DcrName "my-dcr" -OutputPath "dcr.json" +#> +param( + [Parameter(Mandatory)][string]$SubscriptionId, + [Parameter(Mandatory)][string]$ResourceGroupName, + [Parameter(Mandatory)][string]$DcrName, + [string]$OutputPath, + [string]$ApiVersion = "2025-05-11" +) + +$path = "/subscriptions/$SubscriptionId/resourceGroups/$ResourceGroupName/providers/Microsoft.Insights/dataCollectionRules/$DcrName`?api-version=$ApiVersion" +$response = Invoke-AzRestMethod -Path $path -Method GET + +if ($response.StatusCode -ne 200) { + Write-Error "Failed to get DCR. Status: $($response.StatusCode). 
Content: $($response.Content)" + exit 1 +} + +$json = $response.Content | ConvertFrom-Json | ConvertTo-Json -Depth 20 + +if ($OutputPath) { + $json | Set-Content -Path $OutputPath -Encoding utf8 + Write-Host "DCR saved to $OutputPath" +} else { + Write-Output $json +} diff --git a/plugin/skills/azure-data-collection-rules/scripts/get-table-schema.ps1 b/plugin/skills/azure-data-collection-rules/scripts/get-table-schema.ps1 new file mode 100644 index 000000000..d8fcd5dab --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/scripts/get-table-schema.ps1 @@ -0,0 +1,39 @@ +<# +.SYNOPSIS + Retrieves the schema of a Log Analytics table. +.PARAMETER SubscriptionId + Azure subscription ID. +.PARAMETER ResourceGroupName + Resource group containing the workspace. +.PARAMETER WorkspaceName + Log Analytics workspace name. +.PARAMETER TableName + Table name (e.g., "Syslog", "MyCustom_CL"). +.PARAMETER ApiVersion + API version. Defaults to 2022-10-01. +.EXAMPLE + .\get-table-schema.ps1 -SubscriptionId "xxx" -ResourceGroupName "my-rg" -WorkspaceName "my-ws" -TableName "Syslog" +#> +param( + [Parameter(Mandatory)][string]$SubscriptionId, + [Parameter(Mandatory)][string]$ResourceGroupName, + [Parameter(Mandatory)][string]$WorkspaceName, + [Parameter(Mandatory)][string]$TableName, + [string]$ApiVersion = "2022-10-01" +) + +$path = "/subscriptions/$SubscriptionId/resourceGroups/$ResourceGroupName/providers/Microsoft.OperationalInsights/workspaces/$WorkspaceName/tables/$TableName`?api-version=$ApiVersion" +$response = Invoke-AzRestMethod -Path $path -Method GET + +if ($response.StatusCode -ne 200) { + Write-Error "Failed to get table schema. Status: $($response.StatusCode). 
Content: $($response.Content)" + exit 1 +} + +$table = $response.Content | ConvertFrom-Json +# Built-in tables (e.g. Syslog) report their columns under 'standardColumns'; 'columns' holds only custom columns. +# Merge both lists so standard tables do not print an empty listing; Where-Object drops the $null left by a missing list. +$schemaInfo = $table.properties.schema +$columns = @($schemaInfo.standardColumns) + @($schemaInfo.columns) | Where-Object { $_ } + +Write-Host "Table: $($table.properties.schema.name)" +Write-Host "Plan: $($table.properties.plan)" +Write-Host "Retention: $($table.properties.retentionInDays) days" +Write-Host "" +Write-Host "Columns:" +$columns | Format-Table name, type -AutoSize diff --git a/plugin/skills/azure-data-collection-rules/scripts/put-dcr.ps1 b/plugin/skills/azure-data-collection-rules/scripts/put-dcr.ps1 new file mode 100644 index 000000000..3fe44fc9e --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/scripts/put-dcr.ps1 @@ -0,0 +1,49 @@ +<# +.SYNOPSIS + Creates or updates a Data Collection Rule via Azure REST API. +.PARAMETER SubscriptionId + Azure subscription ID. +.PARAMETER ResourceGroupName + Resource group for the DCR. +.PARAMETER DcrName + Name of the Data Collection Rule. +.PARAMETER DcrFilePath + Path to the DCR JSON file. +.PARAMETER ApiVersion + API version. Defaults to 2025-05-11 (multi-stage support). 
+.EXAMPLE + .\put-dcr.ps1 -SubscriptionId "xxx" -ResourceGroupName "my-rg" -DcrName "my-dcr" -DcrFilePath "dcr.json" +#> +param( + [Parameter(Mandatory)][string]$SubscriptionId, + [Parameter(Mandatory)][string]$ResourceGroupName, + [Parameter(Mandatory)][string]$DcrName, + [Parameter(Mandatory)][string]$DcrFilePath, + [string]$ApiVersion = "2025-05-11" +) + +if (-not (Test-Path $DcrFilePath)) { + Write-Error "DCR file not found: $DcrFilePath" + exit 1 +} + +$payload = Get-Content -Path $DcrFilePath -Raw + +# Basic JSON validation (the raw file text, not the parsed object, is what gets sent) +try { + $null = $payload | ConvertFrom-Json -ErrorAction Stop +} catch { + Write-Error "Invalid JSON in $DcrFilePath : $_" + exit 1 +} + +$path = "/subscriptions/$SubscriptionId/resourceGroups/$ResourceGroupName/providers/Microsoft.Insights/dataCollectionRules/$DcrName`?api-version=$ApiVersion" +$response = Invoke-AzRestMethod -Path $path -Method PUT -Payload $payload + +if ($response.StatusCode -in 200, 201) { + Write-Host "DCR '$DcrName' deployed successfully. Status: $($response.StatusCode)" + # Depth 20 matches get-dcr.ps1; a shallower depth truncates nested processor configurations in the echoed DCR + $response.Content | ConvertFrom-Json | ConvertTo-Json -Depth 20 +} else { + Write-Error "Failed to deploy DCR. Status: $($response.StatusCode). Content: $($response.Content)" + exit 1 +} diff --git a/plugin/skills/azure-data-collection-rules/scripts/send-logs.ps1 b/plugin/skills/azure-data-collection-rules/scripts/send-logs.ps1 new file mode 100644 index 000000000..0ec3f2a46 --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/scripts/send-logs.ps1 @@ -0,0 +1,87 @@ +<# +.SYNOPSIS + Sends sample data to Azure Monitor via the Log Ingestion API. +.DESCRIPTION + Authenticates using an Entra app registration (client credentials flow) + and POSTs a JSON array to the DCR logs ingestion endpoint. +.PARAMETER TenantId + Entra tenant ID. +.PARAMETER AppId + Application (client) ID of the registered app. +.PARAMETER AppSecret + Client secret value. 
+.PARAMETER EndpointUri + DCR logs ingestion endpoint URI (e.g., https://my-dcr-xyz.eastus-1.ingest.monitor.azure.com). +.PARAMETER DcrImmutableId + The immutableId of the DCR (e.g., dcr-00000000000000000000000000000000). +.PARAMETER StreamName + Stream name in the DCR (e.g., Custom-MyAppLogs). +.PARAMETER DataFilePath + Path to a JSON file containing an array of log records. +.EXAMPLE + .\send-logs.ps1 -TenantId "xxx" -AppId "yyy" -AppSecret "zzz" ` + -EndpointUri "https://my-dcr.eastus-1.ingest.monitor.azure.com" ` + -DcrImmutableId "dcr-abc123" -StreamName "Custom-MyAppLogs" ` + -DataFilePath "sample-data.json" +#> +param( + [Parameter(Mandatory)][string]$TenantId, + [Parameter(Mandatory)][string]$AppId, + [Parameter(Mandatory)][string]$AppSecret, + [Parameter(Mandatory)][string]$EndpointUri, + [Parameter(Mandatory)][string]$DcrImmutableId, + [Parameter(Mandatory)][string]$StreamName, + [Parameter(Mandatory)][string]$DataFilePath +) + +if (-not (Test-Path $DataFilePath)) { + Write-Error "Data file not found: $DataFilePath" + exit 1 +} + +$data = Get-Content -Path $DataFilePath -Raw + +# Validate JSON array. +# The array check inspects the raw text, not the parsed type: PowerShell 7+ ConvertFrom-Json enumerates arrays, +# so a one-element array would come back as a single object and be rejected by a type test. +try { + $parsed = $data | ConvertFrom-Json -ErrorAction Stop + if ($data -notmatch '^\s*\[') { + Write-Error "Data must be a JSON array (wrap in [ ])" + exit 1 + } +} catch { + Write-Error "Invalid JSON: $_" + exit 1 +} + +# Step 1: Get bearer token +$tokenBody = @{ + client_id = $AppId + scope = "https://monitor.azure.com/.default" + client_secret = $AppSecret + grant_type = "client_credentials" +} +$tokenUri = "https://login.microsoftonline.com/$TenantId/oauth2/v2.0/token" + +try { + $tokenResponse = Invoke-RestMethod -Uri $tokenUri -Method Post -Body $tokenBody -ContentType "application/x-www-form-urlencoded" + $bearerToken = $tokenResponse.access_token +} catch { + Write-Error "Failed to acquire token: $_" + exit 1 +} + +# Step 2: Send data +$sendHeaders = @{ + "Authorization" = "Bearer $bearerToken" + "Content-Type" = "application/json" +} +# Trim a trailing '/' from the endpoint so the request path does not contain a double slash +$sendUri 
= "$($EndpointUri.TrimEnd('/'))/dataCollectionRules/$DcrImmutableId/streams/$($StreamName)?api-version=2023-01-01" + +try { + $response = Invoke-RestMethod -Uri $sendUri -Method Post -Body $data -Headers $sendHeaders + Write-Host "Data sent successfully. Records: $($parsed.Count)" +} catch { + $statusCode = if ($_.Exception.Response) { $_.Exception.Response.StatusCode.value__ } else { 'N/A' } + Write-Error "Failed to send data. Status: $statusCode. Error: $_" + exit 1 +} diff --git a/plugin/skills/azure-data-collection-rules/scripts/validate-dcr.ps1 b/plugin/skills/azure-data-collection-rules/scripts/validate-dcr.ps1 new file mode 100644 index 000000000..eada1fff0 --- /dev/null +++ b/plugin/skills/azure-data-collection-rules/scripts/validate-dcr.ps1 @@ -0,0 +1,303 @@ +<# +.SYNOPSIS + Validates a DCR JSON file for common structural issues before deployment. +.PARAMETER DcrFilePath + Path to the DCR JSON file. +.EXAMPLE + .\validate-dcr.ps1 -DcrFilePath "dcr.json" +#> +param( + [Parameter(Mandatory)][string]$DcrFilePath +) + +if (-not (Test-Path $DcrFilePath)) { + Write-Error "File not found: $DcrFilePath" + exit 1 +} + +$errors = @() +$warnings = @() + +try { + $dcr = Get-Content -Path $DcrFilePath -Raw | ConvertFrom-Json -ErrorAction Stop +} catch { + Write-Error "Invalid JSON: $_" + exit 1 +} + +# Navigate to properties +$props = $dcr.properties +if (-not $props) { + $props = $dcr # Maybe the file IS the properties object +} + +# Check required sections +$isDirect = $false +if ($dcr.kind -eq 'Direct') { + $isDirect = $true +} + +if (-not $isDirect -and -not $props.dataSources) { $errors += "Missing 'dataSources' section (required for non-Direct DCRs)" } +if ($isDirect -and $props.dataSources) { $warnings += "Direct DCR should not have a 'dataSources' section" } +if (-not $props.destinations) { $errors += "Missing 'destinations' section" } +if (-not $props.dataFlows) { $errors += "Missing 'dataFlows' section" } + +# Collect declared custom streams +$declaredStreams = @() +if 
($props.streamDeclarations) { + $declaredStreams = $props.streamDeclarations.PSObject.Properties.Name +} + +# Collect named transformations +$namedTransforms = @() +if ($props.transformations) { + $namedTransforms = $props.transformations | ForEach-Object { $_.name } +} + +# Collect destination names +$destNames = @() +if ($props.destinations) { + foreach ($destType in $props.destinations.PSObject.Properties) { + foreach ($dest in $destType.Value) { + if ($dest.name) { $destNames += $dest.name } + } + } +} + +# Validate data sources +if ($props.dataSources) { + foreach ($dsType in $props.dataSources.PSObject.Properties) { + foreach ($ds in $dsType.Value) { + # Check transform references + if ($ds.transform -and $ds.transform -notin $namedTransforms) { + $errors += "DataSource '$($ds.name)' references transform '$($ds.transform)' which is not defined in transformations" + } + # Check custom stream declarations + # Note: data sources with a 'transform' reference use implicitly-derived custom streams + # that should NOT be in streamDeclarations (except logFiles). Skip this check for those. 
+ if ($ds.streams) { + foreach ($stream in $ds.streams) { + if ($stream.StartsWith("Custom-") -and $stream -notin $declaredStreams) { + if ($ds.transform) { + # Transform-derived custom stream — implicit schema, OK to skip declaration + } elseif ($dsType.Name -eq 'logFiles') { + $errors += "DataSource '$($ds.name)' uses stream '$stream' not declared in streamDeclarations" + } else { + $errors += "DataSource '$($ds.name)' uses stream '$stream' not declared in streamDeclarations" + } + } + } + } + } + } +} + +# Validate data flows +# Standard tables that accept custom streams (loaded from centralized JSON) +$supportedTablesPath = Join-Path $PSScriptRoot '..\references\supported-tables.json' +if (Test-Path $supportedTablesPath) { + $supportedStandardTables = Get-Content -Path $supportedTablesPath -Raw | ConvertFrom-Json +} else { + $warnings += "Could not find supported-tables.json at '$supportedTablesPath'. Custom-stream-to-standard-table routing validation skipped." + $supportedStandardTables = @() +} + +if ($props.dataFlows) { + foreach ($df in $props.dataFlows) { + # Check mutual exclusivity + if ($df.transform -and $df.transformKql) { + $errors += "DataFlow has both 'transform' and 'transformKql' (mutually exclusive)" + } + # Check transform references + if ($df.transform -and $df.transform -notin $namedTransforms) { + $errors += "DataFlow references transform '$($df.transform)' which is not defined in transformations" + } + # Check destination references + if ($df.destinations) { + foreach ($dest in $df.destinations) { + if ($dest -notin $destNames) { + $errors += "DataFlow references destination '$dest' which is not defined in destinations" + } + } + } + # Check custom streams + # Note: transform-derived custom streams from data sources are implicitly declared + if ($df.streams) { + $transformDerivedStreams = @() + if ($props.dataSources) { + foreach ($dsType2 in $props.dataSources.PSObject.Properties) { + foreach ($ds2 in $dsType2.Value) { + if ($ds2.transform 
-and $ds2.streams) { $transformDerivedStreams += $ds2.streams } + } + } + } + foreach ($stream in $df.streams) { + if ($stream.StartsWith("Custom-") -and $stream -notin $declaredStreams -and $stream -notin $transformDerivedStreams) { + $errors += "DataFlow uses stream '$stream' not declared in streamDeclarations" + } + } + } + # Check routing rules + if ($df.outputStream -and $df.streams) { + $inputIsCustom = $df.streams | Where-Object { $_.StartsWith("Custom-") } + $outputIsStandard = $df.outputStream.StartsWith("Microsoft-") + $outputIsCustom = $df.outputStream.StartsWith("Custom-") + + # Rule: standard stream cannot route to custom table (unless transformKql is present) + $inputIsStandard = $df.streams | Where-Object { $_.StartsWith("Microsoft-") } + if ($inputIsStandard -and $outputIsCustom -and -not $df.transformKql) { + $errors += "DataFlow routes standard stream to custom table '$($df.outputStream)'. Standard streams cannot route to custom tables without transformKql. Add transformKql (even 'source' for pass-through) or use a custom stream." + } + + # Rule: custom stream to standard table must be on supported list + if ($inputIsCustom -and $outputIsStandard) { + $tableName = $df.outputStream -replace '^Microsoft-', '' + if ($tableName -notin $supportedStandardTables) { + $errors += "DataFlow routes custom stream to standard table '$tableName' which is not on the supported tables list. Use the standard stream or route to a custom table." 
+ } + } + } + } +} + +# Validate transformations +if ($props.transformations) { + foreach ($t in $props.transformations) { + if (-not $t.name) { $errors += "Transformation missing 'name'" } + if (-not $t.headerProcessor) { $errors += "Transformation '$($t.name)' missing 'headerProcessor'" } + } +} + +# ── Limits validation (from references/limits.md) ── + +# DCR Structure Limits +$dsCount = 0 +if ($props.dataSources) { + foreach ($dsType in $props.dataSources.PSObject.Properties) { + if ($dsType.Value -is [System.Collections.IEnumerable] -and $dsType.Value -isnot [string]) { + $dsCount += @($dsType.Value).Count + } else { + $dsCount++ + } + } +} +if ($dsCount -gt 10) { $errors += "DCR has $dsCount data sources (limit: 10)" } + +if ($props.dataFlows -and @($props.dataFlows).Count -gt 10) { + $errors += "DCR has $(@($props.dataFlows).Count) data flows (limit: 10)" +} + +$streamCount = 0 +if ($props.streamDeclarations) { $streamCount += $props.streamDeclarations.PSObject.Properties.Count } +# Count Microsoft-* streams referenced in dataFlows +$msStreams = @() +if ($props.dataFlows) { + foreach ($df in $props.dataFlows) { + if ($df.streams) { + foreach ($s in $df.streams) { + if ($s.StartsWith("Microsoft-") -and $s -notin $msStreams) { $msStreams += $s } + } + } + } +} +$totalStreams = $streamCount + $msStreams.Count +if ($totalStreams -gt 20) { $errors += "DCR has $totalStreams streams (limit: 20)" } + +$laDestCount = 0 +if ($props.destinations -and $props.destinations.logAnalytics) { + $laDestCount = @($props.destinations.logAnalytics).Count +} +if ($laDestCount -gt 10) { $errors += "DCR has $laDestCount Log Analytics destinations (limit: 10)" } + +# transformKql character limit (15,360) +if ($props.dataFlows) { + foreach ($df in $props.dataFlows) { + if ($df.transformKql -and $df.transformKql.Length -gt 15360) { + $errors += "DataFlow transformKql is $($df.transformKql.Length) chars (limit: 15,360)" + } + } +} + +# Performance counter specifiers limit (100 per data 
source) +if ($props.dataSources -and $props.dataSources.performanceCounters) { + foreach ($pc in $props.dataSources.performanceCounters) { + if ($pc.counterSpecifiers -and @($pc.counterSpecifiers).Count -gt 100) { + $errors += "Performance counter '$($pc.name)' has $(@($pc.counterSpecifiers).Count) specifiers (limit: 100)" + } + } +} + +# Syslog facility names limit (20 per data source) +if ($props.dataSources -and $props.dataSources.syslog) { + foreach ($sl in $props.dataSources.syslog) { + if ($sl.facilityNames -and @($sl.facilityNames).Count -gt 20) { + $errors += "Syslog '$($sl.name)' has $(@($sl.facilityNames).Count) facility names (limit: 20)" + } + } +} + +# Windows Event Log xPathQueries limit (100 per data source) +if ($props.dataSources -and $props.dataSources.windowsEventLogs) { + foreach ($wel in $props.dataSources.windowsEventLogs) { + if ($wel.xPathQueries -and @($wel.xPathQueries).Count -gt 100) { + $errors += "WindowsEventLog '$($wel.name)' has $(@($wel.xPathQueries).Count) xPathQueries (limit: 100)" + } + } +} + +# DCR name validation for Direct kind +if ($dcr.kind -eq 'Direct' -and $dcr.name) { + if ($dcr.name.Length -lt 3 -or $dcr.name.Length -gt 30) { + $errors += "Direct DCR name '$($dcr.name)' must be 3-30 characters (current: $($dcr.name.Length))" + } + if ($dcr.name -notmatch '^[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9]$') { + $errors += "Direct DCR name '$($dcr.name)' must be alphanumeric + hyphens only (DNS-safe)" + } +} + +# Stream declaration column validation +if ($props.streamDeclarations) { + $validColTypes = @('string','int','long','real','boolean','dynamic','datetime') + foreach ($streamProp in $props.streamDeclarations.PSObject.Properties) { + $streamName = $streamProp.Name + $stream = $streamProp.Value + if ($stream.columns) { + $cols = @($stream.columns) + if ($cols.Count -gt 1000) { + $errors += "Stream '$streamName' has $($cols.Count) columns (limit: 1,000)" + } + foreach ($col in $cols) { + if ($col.name.Length -gt 60) { + $errors += 
"Stream '$streamName' column '$($col.name)' exceeds 60 char name limit" + } + if ($col.name -notmatch '^[a-zA-Z][a-zA-Z0-9_]*$') { + $errors += "Stream '$streamName' column '$($col.name)' has invalid name (must start with letter, only alphanumeric + underscore)" + } + if ($col.type -and $col.type -notin $validColTypes) { + $errors += "Stream '$streamName' column '$($col.name)' has unsupported type '$($col.type)' (use: $($validColTypes -join ', '))" + } + } + } + # Custom stream naming + if ($streamName.StartsWith('Microsoft-')) { + $errors += "Stream '$streamName' in streamDeclarations must not start with 'Microsoft-'. Standard streams have implicit schemas and should not be declared." + } elseif (-not $streamName.StartsWith('Custom-')) { + $errors += "Stream '$streamName' must start with 'Custom-' (standard streams use 'Microsoft-' prefix and should not appear in streamDeclarations)" + } + } +} + +# Report +if ($errors.Count -eq 0 -and $warnings.Count -eq 0) { + Write-Host "Validation PASSED. No issues found." -ForegroundColor Green +} else { + if ($errors.Count -gt 0) { + Write-Host "ERRORS ($($errors.Count)):" -ForegroundColor Red + $errors | ForEach-Object { Write-Host " - $_" -ForegroundColor Red } + } + if ($warnings.Count -gt 0) { + Write-Host "WARNINGS ($($warnings.Count)):" -ForegroundColor Yellow + $warnings | ForEach-Object { Write-Host " - $_" -ForegroundColor Yellow } + } + if ($errors.Count -gt 0) { exit 1 } +}