Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,14 @@ protected static List<string> GetLocalProviderNames(Regex? regex)
}

/// <summary>
/// Yields <see cref="ProviderDetails" /> for each local provider, applying the optional regex filter and skip set
/// before metadata is resolved.
/// Yields <see cref="ProviderDetails" /> for each local provider, applying the optional regex filter and
/// name-level <paramref name="excludeProviderNames" /> exclude set before metadata is resolved. Local providers are
/// live, so their identity is always <c>(name, "")</c>; a name-level exclude is therefore exact here.
/// </summary>
protected static async IAsyncEnumerable<ProviderDetails> LoadLocalProvidersAsync(
ITraceLogger logger,
Regex? regex,
IReadOnlySet<string>? skipProviderNames = null,
IReadOnlySet<string>? excludeProviderNames = null,
[EnumeratorCancellation] CancellationToken cancellationToken = default)
{
// Local-provider resolution is synchronous (registry/metadata reads). This wrapper exposes it as an
Expand All @@ -101,7 +102,7 @@ protected static async IAsyncEnumerable<ProviderDetails> LoadLocalProvidersAsync
{
cancellationToken.ThrowIfCancellationRequested();

if (skipProviderNames is not null && skipProviderNames.Contains(providerName)) { continue; }
if (excludeProviderNames is not null && excludeProviderNames.Contains(providerName)) { continue; }

yield return new EventMessageProvider(providerName, logger: logger).LoadProviderDetails();
}
Expand Down
158 changes: 134 additions & 24 deletions src/EventLogExpert.DatabaseTools/Common/Operations/ProviderSource.cs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,37 @@ internal static class ProviderSource
private const string DbExtension = ".db";
private const string EvtxExtension = ".evtx";

/// <summary>
/// Collects the distinct provider identities (provider name paired with its content
/// <see cref="ProviderDetails.VersionKey" />) found in the source file(s) under <paramref name="path" />. When
/// <paramref name="regex" /> is supplied, only identities whose name matches it are kept. Live providers discovered
/// from an .evtx source are reported as <c>(name, "")</c> because event records expose only the name. Full provider
/// details are never loaded.
/// </summary>
/// <returns>
/// The distinct identities ordered by name (case-insensitively, then ordinally to break case-only ties) and finally
/// by version key.
/// </returns>
public static async Task<IReadOnlyList<ProviderIdentity>> LoadProviderIdentitiesAsync(
    string path,
    ITraceLogger logger,
    Regex? regex = null,
    CancellationToken cancellationToken = default)
{
    // The set removes identities that several source files have in common.
    var distinct = new HashSet<ProviderIdentity>();

    foreach (var sourceFile in EnumerateSourceFiles(path, logger))
    {
        cancellationToken.ThrowIfCancellationRequested();

        var fromFile = await LoadIdentitiesFromFileAsync(sourceFile, logger, cancellationToken);

        foreach (var candidate in fromFile)
        {
            // Guard: a supplied regex must match the provider name for the identity to be kept.
            if (regex is not null && !regex.IsMatch(candidate.ProviderName)) { continue; }

            distinct.Add(candidate);
        }
    }

    // Three-level ordering gives callers a deterministic listing regardless of set iteration order.
    var ordered = distinct
        .OrderBy(entry => entry.ProviderName, StringComparer.OrdinalIgnoreCase)
        .ThenBy(entry => entry.ProviderName, StringComparer.Ordinal)
        .ThenBy(entry => entry.VersionKey, StringComparer.Ordinal);

    return ordered.ToList();
}

/// <summary>
/// Returns the distinct provider names available from <paramref name="path" />, applying an optional
/// <paramref name="regex" /> to filter names. Case sensitivity follows the caller's <see cref="RegexOptions" />. Does
Expand All @@ -57,14 +88,21 @@ public static async Task<IReadOnlyList<string>> LoadProviderNamesAsync(
return regex is null ? names.ToList() : names.Where(n => regex.IsMatch(n)).ToList();
}

/// <summary>
/// Streams full <see cref="ProviderDetails" /> from <paramref name="path" />, skipping providers two ways:
/// <paramref name="excludeProviderNames" /> drops EVERY version of a named provider (a user / name-level exclude),
/// while <paramref name="skipIdentities" /> drops only specific <c>(name, version)</c> identities (e.g. versions
/// already present in a merge/diff target). A provider is skipped if either set matches.
/// </summary>
public static IAsyncEnumerable<ProviderDetails> LoadProvidersAsync(
string path,
ITraceLogger logger,
Regex? regex = null,
IReadOnlySet<string>? skipProviderNames = null,
IReadOnlySet<string>? excludeProviderNames = null,
IReadOnlySet<ProviderIdentity>? skipIdentities = null,
IReadOnlyList<string>? preDiscoveredProviderNames = null,
CancellationToken cancellationToken = default) =>
LoadProvidersIteratorAsync(path, logger, regex, skipProviderNames, preDiscoveredProviderNames, cancellationToken);
LoadProvidersIteratorAsync(path, logger, regex, excludeProviderNames, skipIdentities, preDiscoveredProviderNames, cancellationToken);

/// <summary>Validates that <paramref name="path" /> exists and has a recognized form.</summary>
public static bool TryValidate(string path, ITraceLogger logger)
Expand Down Expand Up @@ -207,8 +245,9 @@ private static async Task<IReadOnlyList<ProviderDetails>> LoadDbDetailsAsync(
string file,
ITraceLogger logger,
Regex? regex,
IReadOnlySet<string>? skipProviderNames,
HashSet<string> seen,
IReadOnlySet<string>? excludeProviderNames,
IReadOnlySet<ProviderIdentity>? skipIdentities,
HashSet<ProviderIdentity> seen,
CancellationToken cancellationToken)
{
try
Expand All @@ -219,42 +258,69 @@ private static async Task<IReadOnlyList<ProviderDetails>> LoadDbDetailsAsync(

context.ChangeTracker.QueryTrackingBehavior = QueryTrackingBehavior.NoTracking;

// Filter by name without mutating `seen` so that a subsequent catch does not
// permanently mark these names as loaded when they were never successfully read.
var allNames = await context.ProviderDetails.Select(p => p.ProviderName).ToListAsync(cancellationToken);
var namesToLoad = allNames
.Where(name =>
// Project (name, version) identities and filter without mutating `seen`, so a subsequent catch does not
// permanently mark these identities as loaded when they were never successfully read. Dedup is
// version-aware: an identity already loaded from an earlier source file is skipped, but a different
// version of the same provider name is not.
var allIdentities = await context.ProviderDetails
.Select(p => new { p.ProviderName, p.VersionKey })
.ToListAsync(cancellationToken);

var identitiesToLoad = allIdentities
.Select(p => new ProviderIdentity(p.ProviderName, p.VersionKey))
.Where(identity =>
{
if (seen.Contains(name)) { return false; }
if (seen.Contains(identity)) { return false; }

if (regex is not null && !regex.IsMatch(identity.ProviderName)) { return false; }

if (regex is not null && !regex.IsMatch(name)) { return false; }
if (excludeProviderNames is not null && excludeProviderNames.Contains(identity.ProviderName)) { return false; }

return skipProviderNames is null || !skipProviderNames.Contains(name);
return skipIdentities is null || !skipIdentities.Contains(identity);
})
.OrderBy(n => n, StringComparer.OrdinalIgnoreCase)
.ToList();

if (namesToLoad.Count == 0) { return []; }
if (identitiesToLoad.Count == 0) { return []; }

var wantedIdentities = identitiesToLoad.ToHashSet();

// Reload full rows by NAME (SQLite cannot translate a composite-tuple IN clause), chunking to stay below
// SQLite's parameter limit (default 999). Derive the names from the identity LIST (not the HashSet) and
// de-duplicate them ORDINALLY: ProviderIdentity equality and SQLite's NOCASE collation differ on non-ASCII
// case (NOCASE folds only ASCII, OrdinalIgnoreCase folds all of Unicode), so two names differing solely by
// non-ASCII case are distinct rows that must both reach the IN clause - collapsing them via the HashSet or
// an OrdinalIgnoreCase Distinct would drop one. The reload by name can pull versions we did not ask for, so
// post-filter the materialized rows back down to the wanted identities. Today VersionKey is always empty,
// making the post-filter a no-op; it becomes load-bearing once content hashing lets versions coexist.
var namesToLoad = identitiesToLoad
.Select(identity => identity.ProviderName)
.Distinct(StringComparer.Ordinal)
.OrderBy(name => name, StringComparer.OrdinalIgnoreCase)
.ToList();

// Chunk the IN-clause to stay below SQLite's parameter limit (default 999).
var loaded = new List<ProviderDetails>(namesToLoad.Count);
var loaded = new List<ProviderDetails>(identitiesToLoad.Count);

for (var offset = 0; offset < namesToLoad.Count; offset += MaxInClauseParameters)
{
cancellationToken.ThrowIfCancellationRequested();

var chunk = namesToLoad
.Skip(offset)
.Take(MaxInClauseParameters)
.ToList();

loaded.AddRange(await context.ProviderDetails
var rows = await context.ProviderDetails
.Where(p => chunk.Contains(p.ProviderName))
.OrderBy(p => p.ProviderName)
.ToListAsync(cancellationToken));
.ThenBy(p => p.VersionKey)
.ToListAsync(cancellationToken);

loaded.AddRange(rows.Where(row => wantedIdentities.Contains(ProviderIdentity.Of(row))));
}

// Mark as seen only after a successful load so a catch for a corrupt file does
// not prevent the same provider names from being loaded from a later source file.
foreach (var name in namesToLoad) { seen.Add(name); }
// not prevent the same identities from being loaded from a later source file.
foreach (var identity in identitiesToLoad) { seen.Add(identity); }

return loaded;
}
Expand All @@ -266,6 +332,49 @@ private static async Task<IReadOnlyList<ProviderDetails>> LoadDbDetailsAsync(
}
}

/// <summary>
/// Reads the provider identities exposed by a single source <paramref name="file" />, dispatching on its extension.
/// A database file contributes one <c>(name, version key)</c> pair per stored provider row; an exported event log
/// contributes <c>(name, "")</c> for each discovered provider name. Any other extension is logged as unsupported
/// and yields an empty list.
/// </summary>
/// <returns>
/// The identities found in the file, or an empty list when the file is unsupported, fails the
/// <c>IsSourceSchemaCurrent</c> check, or raises a <see cref="DbException" /> while being read.
/// </returns>
private static async Task<IReadOnlyList<ProviderIdentity>> LoadIdentitiesFromFileAsync(
    string file,
    ITraceLogger logger,
    CancellationToken cancellationToken)
{
    var extension = Path.GetExtension(file);
    var isDatabase = string.Equals(extension, DbExtension, StringComparison.OrdinalIgnoreCase);
    var isEventLog = string.Equals(extension, EvtxExtension, StringComparison.OrdinalIgnoreCase);

    if (!isDatabase && !isEventLog)
    {
        logger.Warning($"Skipping unsupported source file: {file}");

        return [];
    }

    if (!isDatabase)
    {
        // Live providers from an exported log expose only a name (no content version), so their identity is
        // (name, empty).
        var discovered = new List<ProviderIdentity>();

        foreach (var providerName in MtaProviderSource.DiscoverProviderNames(file, logger))
        {
            discovered.Add(new ProviderIdentity(providerName, string.Empty));
        }

        return discovered;
    }

    try
    {
        await using var providerContext = new ProviderDbContext(file, true, false, logger);

        if (!IsSourceSchemaCurrent(providerContext, file, logger)) { return []; }

        // Project only the two identity columns so full provider rows are never materialized.
        var rows = await providerContext.ProviderDetails
            .AsNoTracking()
            .Select(p => new { p.ProviderName, p.VersionKey })
            .ToListAsync(cancellationToken);

        var stored = new List<ProviderIdentity>(rows.Count);

        foreach (var row in rows)
        {
            stored.Add(new ProviderIdentity(row.ProviderName, row.VersionKey));
        }

        return stored;
    }
    catch (DbException ex)
    {
        // Best effort: an unreadable database is reported and skipped rather than failing the whole scan.
        logger.Warning($"Skipping invalid database file '{file}': {ex.Message}");

        return [];
    }
}

private static async Task<IReadOnlyList<string>> LoadNamesFromFileAsync(string file, ITraceLogger logger, CancellationToken cancellationToken)
{
var ext = Path.GetExtension(file);
Expand Down Expand Up @@ -301,11 +410,12 @@ private static async IAsyncEnumerable<ProviderDetails> LoadProvidersIteratorAsyn
string path,
ITraceLogger logger,
Regex? regex,
IReadOnlySet<string>? skipProviderNames,
IReadOnlySet<string>? excludeProviderNames,
IReadOnlySet<ProviderIdentity>? skipIdentities,
IReadOnlyList<string>? preDiscoveredProviderNames,
[EnumeratorCancellation] CancellationToken cancellationToken)
{
var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
var seen = new HashSet<ProviderIdentity>();
var files = EnumerateSourceFiles(path, logger).ToList();

// preDiscoveredProviderNames optimization only applies for single-file .evtx sources. For folders and .db
Expand All @@ -325,15 +435,15 @@ private static async IAsyncEnumerable<ProviderDetails> LoadProvidersIteratorAsyn
{
// Materialize the full per-file result before yielding so a mid-read DbException yields nothing for
// this file and does not corrupt the cross-file `seen` set. C# also forbids `yield` inside try/catch.
var loaded = await LoadDbDetailsAsync(file, logger, regex, skipProviderNames, seen, cancellationToken);
var loaded = await LoadDbDetailsAsync(file, logger, regex, excludeProviderNames, skipIdentities, seen, cancellationToken);

foreach (var details in loaded) { yield return details; }
}
else if (string.Equals(ext, EvtxExtension, StringComparison.OrdinalIgnoreCase))
{
var hint = canUsePreDiscovered ? preDiscoveredProviderNames : null;

foreach (var details in MtaProviderSource.LoadProviders(file, logger, regex, skipProviderNames, seen, hint))
foreach (var details in MtaProviderSource.LoadProviders(file, logger, regex, excludeProviderNames, skipIdentities, seen, hint))
{
cancellationToken.ThrowIfCancellationRequested();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
// // Licensed under the MIT License.

using EventLogExpert.DatabaseTools.Common.Operations;
using EventLogExpert.Eventing.PublisherMetadata;
using EventLogExpert.Logging.Abstractions;
using EventLogExpert.Provider.Resolution;
using EventLogExpert.ProviderDatabase.Context;
using EventLogExpert.ProviderDatabase.Hashing;
using System.Text.RegularExpressions;

namespace EventLogExpert.DatabaseTools.CreateDatabase;
Expand Down Expand Up @@ -43,7 +45,7 @@ public async Task<DatabaseToolsOutcome> ExecuteAsync(
return DatabaseToolsOutcome.Failed;
}

HashSet<string> skipProviderNames = new(StringComparer.OrdinalIgnoreCase);
HashSet<string> excludeProviderNames = new(StringComparer.OrdinalIgnoreCase);

if (!string.IsNullOrWhiteSpace(request.SkipProvidersInFile))
{
Expand All @@ -54,10 +56,10 @@ public async Task<DatabaseToolsOutcome> ExecuteAsync(

foreach (var name in await ProviderSource.LoadProviderNamesAsync(request.SkipProvidersInFile, logger, cancellationToken: cancellationToken))
{
skipProviderNames.Add(name);
excludeProviderNames.Add(name);
}

logger.Information($"Found {skipProviderNames.Count} providers in {request.SkipProvidersInFile}. These will not be included in the new database.");
logger.Information($"Found {excludeProviderNames.Count} providers in {request.SkipProvidersInFile}. These will not be included in the new database.");
}

// Defensive recompile if input has Regex.InfiniteMatchTimeout (otherwise catch below is dead).
Expand All @@ -67,6 +69,11 @@ public async Task<DatabaseToolsOutcome> ExecuteAsync(
var headerLogged = false;
var pendingForHeader = new List<ProviderDetails>(BatchSize);

// Collapse identical content arriving under different source keys (e.g. an unstamped legacy row plus an
// already-hashed row in a multi-file source): both re-hash to the same VersionKey, so the second would
// otherwise collide on the composite primary key. Track stamped identities and skip duplicates first-wins.
var stampedIdentities = new HashSet<ProviderIdentity>();

// Defer creating the DbContext (and therefore the .db file on disk) until we have
// at least one provider to persist. This prevents leaving an empty database behind
// when no provider details could be resolved.
Expand All @@ -75,13 +82,30 @@ public async Task<DatabaseToolsOutcome> ExecuteAsync(
try
{
IAsyncEnumerable<ProviderDetails> providersToAdd = request.SourcePath is null
? LoadLocalProvidersAsync(logger, filterRegex, skipProviderNames, cancellationToken)
: ProviderSource.LoadProvidersAsync(request.SourcePath, logger, filterRegex, skipProviderNames, cancellationToken: cancellationToken);
? LoadLocalProvidersAsync(logger, filterRegex, excludeProviderNames, cancellationToken)
: ProviderSource.LoadProvidersAsync(request.SourcePath, logger, filterRegex, excludeProviderNames, cancellationToken: cancellationToken);

var hostOsProvenance = request.SourcePath is null ? HostOsProvenance.Read(logger) : null;

await foreach (var details in providersToAdd.WithCancellation(cancellationToken))
{
cancellationToken.ThrowIfCancellationRequested();

// Stamp the content hash so distinct versions of a provider name coexist under the composite key and
// identical providers (across machines / OS builds) collapse to one row. Idempotent for an
// already-hashed source; computes the key for freshly-resolved (live) providers.
details.VersionKey = VersionKeyCalculator.Compute(details);

if (!stampedIdentities.Add(ProviderIdentity.Of(details))) { continue; }

if (hostOsProvenance is not null)
{
details.SourceOsBuild = hostOsProvenance.Build;
details.SourceOsRevision = hostOsProvenance.Revision;
details.SourceOsEdition = hostOsProvenance.Edition;
details.SourceOsDisplayVersion = hostOsProvenance.DisplayVersion;
}

if (!headerLogged)
{
pendingForHeader.Add(details);
Expand Down
Loading
Loading