From ad6eed8595cf895a00282b6bf4505142424612ca Mon Sep 17 00:00:00 2001 From: Ryan McKenna Date: Tue, 23 Jun 2026 16:32:46 -0700 Subject: [PATCH] Deprecate data_generation_v2.generate() in favor of TabularSynthesizer. PiperOrigin-RevId: 936961615 --- dpsynth/CHANGELOG.md => CHANGELOG.md | 4 +- docs/in_memory_api.md | 64 ++++--- docs/index.md | 2 +- docs/sitemap.md | 4 +- dpsynth/data_generation_v2.py | 272 ++------------------------- tests/data_generation_v2_test.py | 148 ++------------- 6 files changed, 73 insertions(+), 421 deletions(-) rename dpsynth/CHANGELOG.md => CHANGELOG.md (90%) diff --git a/dpsynth/CHANGELOG.md b/CHANGELOG.md similarity index 90% rename from dpsynth/CHANGELOG.md rename to CHANGELOG.md index bd13223..9648570 100644 --- a/dpsynth/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,8 +16,8 @@ private synthetic data. This first release contains code for generating differentially private synthetic tabular data using marginal measurement and Private-PGM inference, including: -- **Two execution modes**: In-memory local mode (via `dpsynth.generate()`, - tested up to ~100M rows) and a distributed Apache Beam mode for larger +- **Two execution modes**: In-memory local mode + (via `dpsynth.TabularSynthesizer`, tested up to ~100M rows) and a distributed Apache Beam mode for larger workloads. - **Marginal-based mechanisms**: AIM, MST, Independent, and Direct mechanisms for selecting and measuring marginals under differential privacy. diff --git a/docs/in_memory_api.md b/docs/in_memory_api.md index a9868a2..9dc4fd4 100644 --- a/docs/in_memory_api.md +++ b/docs/in_memory_api.md @@ -11,32 +11,31 @@ within a single machine's RAM. -------------------------------------------------------------------------------- -## Python API: `dpsynth.generate` +## Python API: `dpsynth.TabularSynthesizer` -The primary entry point for in-memory synthesis is `dpsynth.generate()`. 
It -accepts a Pandas DataFrame alongside a dictionary of attribute domains and -returns a fully synthetic, differentially private DataFrame matching the exact -schema and data types of your input. +The primary entry point for in-memory synthesis is +`dpsynth.TabularSynthesizer`. It accepts a dictionary of attribute domains, +is calibrated with a privacy budget, and generates a fully synthetic, +differentially private DataFrame matching the exact schema and data types of +your input. -### Function Signature +### Usage ```python import dpsynth from dpsynth import discrete_mechanisms +import numpy as np import pandas as pd -synthetic_df = dpsynth.generate( - data: pd.DataFrame, - domains: dict[str, dpsynth.domain.AttributeType], - epsilon: float, - delta: float, - *, - discrete_config: discrete_mechanisms.DiscreteMechanismConfig = discrete_mechanisms.MSTConfig(), - numerical_bins: int = 32, - one_way_marginal_budget_fraction: float = 0.1, - cross_attribute_constraints: list = (), - skip_compression: bool = False, -) -> pd.DataFrame +synth = dpsynth.TabularSynthesizer( + domains=domains, + discrete_mechanism=discrete_mechanisms.MSTMechanism(), +) +result = synth.calibrate( + epsilon=1.0, + delta=1e-6, +)(np.random.default_rng(), sensitive_df) +synthetic_df = result.synthetic_data ``` ### Key Arguments @@ -70,6 +69,7 @@ synthetic records. import dpsynth from dpsynth import discrete_mechanisms from dpsynth import domain +import numpy as np import pandas as pd # 1. Load sensitive tabular data into Pandas @@ -78,23 +78,25 @@ sensitive_df = pd.read_csv("sensitive_transactions.csv") # 2. Load domain schema from YAML attribute_domains = domain.from_yaml_file("transaction_domain.yaml") -# 3. Configure the synthesis mechanism (AIM) -aim_config = discrete_mechanisms.AIMConfig( - seed=42, - rounds=50, - pgm_iters=1000, -) - -# 4. Generate Differentially Private synthetic data -synthetic_df = dpsynth.generate( - data=sensitive_df, +# 3. 
Configure and calibrate the synthesizer (AIM) +synth = dpsynth.TabularSynthesizer( domains=attribute_domains, + discrete_mechanism=discrete_mechanisms.AIMConfig( + seed=42, + rounds=50, + pgm_iters=1000, + ), +) +calibrated = synth.calibrate( epsilon=1.0, delta=1e-6, - discrete_config=aim_config, - numerical_bins=16, # Use 16 quantile buckets for numerical columns + numerical_bins=16, # Use 16 quantile buckets for numerical columns ) +# 4. Generate Differentially Private synthetic data +result = calibrated(np.random.default_rng(), sensitive_df) +synthetic_df = result.synthetic_data + # 5. Save the synthetic dataframe synthetic_df.to_csv("synthetic_transactions.csv", index=False) print("Synthetic data successfully generated!") @@ -139,7 +141,7 @@ python3 bin/main.py \ ## Under the Hood: The In-Memory Lifecycle -When you invoke `dpsynth.generate()`, the library performs the following +When you invoke `TabularSynthesizer`, the library performs the following single-machine pipeline: 1. **Discretization**: Continuous numerical columns are bucketed into diff --git a/docs/index.md b/docs/index.md index b19fae7..d336613 100644 --- a/docs/index.md +++ b/docs/index.md @@ -58,7 +58,7 @@ dataframes to massive distributed datasets across computing clusters: └────────────────────────────────────────┘ ``` -### 1. In-Memory DataFrame API (`dpsynth.generate`) +### 1. In-Memory DataFrame API (`dpsynth.TabularSynthesizer`) Optimized for rapid prototyping, research experimentation, and datasets that easily fit within single-machine memory. diff --git a/docs/sitemap.md b/docs/sitemap.md index 5bac456..058eb24 100644 --- a/docs/sitemap.md +++ b/docs/sitemap.md @@ -11,7 +11,7 @@ * [Why DPSynth?](index.md#why-dpsynth) * [Core APIs and Execution Models](index.md#core-apis-and-execution-models) - * [1. In-Memory DataFrame API (`dpsynth.generate`)](index.md#1-in-memory-dataframe-api-dpsynthgenerate) + * [1. 
In-Memory DataFrame API (`dpsynth.TabularSynthesizer`)](index.md#1-in-memory-dataframe-api-dpsynthtabularsynthesizer) * [2. Scalable PipelineBackend API (`dpsynth.data_generation`)](index.md#2-scalable-pipelinebackend-api-dpsynthdata_generation) * [Documentation Sitemap & Navigation](index.md#documentation-sitemap--navigation) * [Supported Synthesis Algorithms](index.md#supported-synthesis-algorithms) @@ -46,7 +46,7 @@
📁 In-Memory DataFrame API Guide -* [Python API: `dpsynth.generate`](in_memory_api.md#python-api-dpsynthgenerate) +* [Python API: `dpsynth.TabularSynthesizer`](in_memory_api.md#python-api-dpsynthtabularsynthesizer) * [Function Signature](in_memory_api.md#function-signature) * [Key Arguments](in_memory_api.md#key-arguments) * [End-to-End Python Example](in_memory_api.md#end-to-end-python-example) diff --git a/dpsynth/data_generation_v2.py b/dpsynth/data_generation_v2.py index 06a6f8c..aeb8a6d 100644 --- a/dpsynth/data_generation_v2.py +++ b/dpsynth/data_generation_v2.py @@ -12,94 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Implementation of an end-to-end DP synthetic data generation mechanism. +"""Deprecated shim. -.. deprecated:: - This module is deprecated. Use - :class:`dpsynth.data_generation_v3.TabularSynthesizer` - instead. +Use :class:`dpsynth.data_generation_v3.TabularSynthesizer` instead. """ from collections.abc import Mapping, Sequence -from typing import TypeAlias import warnings -from absl import logging -import dp_accounting from dpsynth import constraints from dpsynth import discrete_mechanisms from dpsynth import domain -from dpsynth import transformations -from dpsynth.discrete_mechanisms import accounting -from dpsynth.discrete_mechanisms import common -from dpsynth.pipeline_transformations import categorical_values_derivation -from dpsynth.pipeline_transformations import dp_auto_discretizer -import mbi +from dpsynth.data_generation_v3 import TabularSynthesizer import numpy as np import pandas as pd -import pipeline_dp - -Dataset: TypeAlias = pd.DataFrame - - -def _compress_data(data, one_way_measurements): - """Compresses the domain and measurements if necessary.""" - compressed_domain, compressed_one_way_measurements, compress_transforms = ( - common.get_domain_compression_transformations(one_way_measurements) - ) - - total_measurement = 
common.convert_to_total_measurement(one_way_measurements) - - logging.info( - '[SynthKit Tabular]: Estimated Total %d', - total_measurement.noisy_measurement, - ) - compressed_data = mbi.Dataset( - transformations.apply(data.df, compress_transforms), - compressed_domain, - ) - logging.info('[SynthKit Tabular]: Original domain: %s', data.domain) - logging.info('[SynthKit Tabular]: Compressed domain: %s', compressed_domain) - - measurements = [total_measurement] + list(compressed_one_way_measurements) - return compressed_data, measurements, compress_transforms - - -def _compute_privacy_parameters( - epsilon: float, - delta: float, - one_way_marginal_budget_fraction: float, - discrete_config: discrete_mechanisms.DiscreteMechanism, -) -> tuple[float, float]: - """Compute privacy parameters for one-way marginals and discrete mechanism.""" - - one_way_marginal_sigma = dp_accounting.get_sigma_gaussian( - epsilon=one_way_marginal_budget_fraction * epsilon, - delta=one_way_marginal_budget_fraction * delta, - ) - one_way_marginal_gdp_mu = 1.0 / one_way_marginal_sigma**2 - - def make_event_from_param(zcdp_rho): - event1 = dp_accounting.GaussianDpEvent(one_way_marginal_sigma) - event2 = discrete_config.calibrate(zcdp_rho=zcdp_rho).dp_event - return dp_accounting.ComposedDpEvent([event1, event2]) - - if isinstance( - discrete_config.calibrate(zcdp_rho=1.0).dp_event, dp_accounting.ZCDpEvent - ): - make_fresh_accountant = dp_accounting.rdp.RdpAccountant - else: - make_fresh_accountant = dp_accounting.pld.PLDAccountant - - discrete_mechanism_zcdp_rho = dp_accounting.calibrate_dp_mechanism( - make_event_from_param=make_event_from_param, - target_epsilon=0.9 * epsilon, - target_delta=0.9 * delta, - make_fresh_accountant=make_fresh_accountant, - bracket_interval=dp_accounting.LowerEndpointAndGuess(1e-3, 1.0), - ) - - return one_way_marginal_gdp_mu, discrete_mechanism_zcdp_rho def generate( @@ -108,191 +34,31 @@ def generate( epsilon: float, delta: float, *, - discrete_config: 
discrete_mechanisms.DiscreteMechanism = discrete_mechanisms.MSTMechanism(), + discrete_config: ( + discrete_mechanisms.DiscreteMechanism + ) = discrete_mechanisms.MSTMechanism(), numerical_bins: int = 32, one_way_marginal_budget_fraction: float = 0.1, cross_attribute_constraints: Sequence[constraints.Constraint] = (), skip_compression: bool = False, ) -> pd.DataFrame: - """Generate synthetic data with record-level differential privacy. - - Ths function encodes the input categorical and numerical data into a - discrete domain, then runs the specified mechanism on the discretized data. - Finally, it converts the synthetic data back to the original domain. - - Args: - data: The dataset to generate synthetic data for. - domains: A mapping from column names to attribute domains. Every key in this - mapping must be a column of `data`. - epsilon: Privacy parameter. - delta: Privacy parameter. - discrete_config: The mechanism configuration for the discretized and - integer-encoded data. - numerical_bins: The number of bins to use for discretization. - one_way_marginal_budget_fraction: The fraction of the total privacy budget - to use for one-way marginal queries. - cross_attribute_constraints: Constraints to enforce on the generated data. - skip_compression: Whether to skip the domain compression step. - - Returns: - A synthetic dataset. - """ + """Deprecated. Use :class:`data_generation_v3.TabularSynthesizer` instead.""" warnings.warn( 'data_generation_v2.generate() is deprecated. Use' ' data_generation_v3.TabularSynthesizer instead.', DeprecationWarning, stacklevel=2, ) - assert 0 <= one_way_marginal_budget_fraction <= 1 - if not skip_compression and cross_attribute_constraints: - raise ValueError( - 'Compression is not supported when cross-attribute constraints are' - ' provided.' - ) - for col in domains: - if col not in data.columns: - raise ValueError( - f'{col=} not found in the dataset. 
Available columns: {data.columns}' - ) - if isinstance(domains[col], domain.FreeFormTextAttribute): - raise ValueError( - f'FreeFormTextAttribute is not supported for column {col!r}.' - ' Free-form text attributes cannot be synthesized by this mechanism.' - ) - - backend = pipeline_dp.LocalBackend() - - # only for initialization (numerical + unknown domain categorical) - accountant = pipeline_dp.NaiveBudgetAccountant(0.1 * epsilon, 0.1 * delta) - engine = pipeline_dp.DPEngine(accountant, backend) - # for remainder of mechanism, not going through pipeline_dp accounting - - one_way_marginal_gdp_mu, discrete_zcdp_rho = _compute_privacy_parameters( - 0.9 * epsilon, - 0.9 * delta, - one_way_marginal_budget_fraction, - discrete_config, - ) - - ################################################## - # Map the data to a standardized discrete domain # - ################################################## - transform_fns = {} - discrete_domains = {} - - numerical_attributes = { - col: dom - for col, dom in domains.items() - if isinstance(dom, domain.NumericalAttribute) - } - open_set_categorical_attributes = [ - col - for col, dom in domains.items() - if isinstance(dom, domain.OpenSetCategoricalAttribute) - ] - if numerical_attributes: - # dp_auto_discretizer does not currently handle empty dict here. 
- output_numerical = ( - dp_auto_discretizer.create_transformations_via_dp_quantiles( - pcol=(dict(s) for _, s, in data.iterrows()), - engine=engine, - backend=backend, - field_name_to_attribute=numerical_attributes, - num_quanitle_buckets=numerical_bins, - ) - ) - else: - output_numerical = None - - if open_set_categorical_attributes: - output_categorical = ( - categorical_values_derivation.derive_categorical_values( - input_data=(dict(s) for _, s, in data.iterrows()), - backend=backend, - dp_engine=engine, - attribute_keys_to_derive=list(open_set_categorical_attributes), - ) - ) - logging.info('output_categorical: %s', output_categorical) - else: - output_categorical = None - - accountant.compute_budgets() - if output_numerical is not None: - for field_name, cat_attr, to_categorical in output_numerical: - logging.info('Discretizing numerical column: %s', field_name) - to_standardized = transformations.discrete_encoder(cat_attr) - transform_fns[field_name] = to_standardized @ to_categorical - discrete_domains[field_name] = cat_attr.size - - if output_categorical is not None: - for field_name, cat_attr in list(output_categorical)[0].items(): - logging.info('Deriving categorical column: %s', field_name) - transform_fns[field_name] = transformations.discrete_encoder(cat_attr) - discrete_domains[field_name] = cat_attr.size - - categorical_attributes = { - col: dom - for col, dom in domains.items() - if isinstance(dom, domain.CategoricalAttribute) - } - for col, attr in categorical_attributes.items(): - logging.info('Encoding categorical column: %s', col) - transform_fns[col] = transformations.discrete_encoder(attr) - discrete_domains[col] = attr.size - - discrete = {} - for col in discrete_domains: - logging.info('Encoding categorical column: %s', col) - dtype = np.min_scalar_type(discrete_domains[col]) - values = data[col].map(transform_fns[col].transform).values - discrete[col] = values.astype(dtype) - - discrete = mbi.Dataset(discrete, 
mbi.Domain.fromdict(discrete_domains)) - - logging.info('[SynthKit Tabular]: Finished encoding data.') - - ####################################################################### - # Measure 1-way marginals and compress domain by merging rare values. # - ####################################################################### - one_way_marginal_queries = [(col,) for col in discrete.domain] - gdp_sigma = accounting.gdp_gaussian_sigma(one_way_marginal_gdp_mu) - rng = np.random.default_rng() - one_way_measurements = common.measure_marginals_with_noise( - rng, discrete, one_way_marginal_queries, gdp_sigma - ) - logging.info('[SynthKit Tabular]: Measured one-way marginals.') - - if not skip_compression: - discrete, one_way_measurements, compress_transforms = _compress_data( - discrete, one_way_measurements - ) - for col in compress_transforms: - transform_fns[col] = compress_transforms[col] @ transform_fns[col] - - # Run the mechanism on the discretized data. - initial_potentials = constraints.get_initial_parameters( - cross_attribute_constraints, discrete.domain + del skip_compression # Not supported by TabularSynthesizer. + synth = TabularSynthesizer( + domains=domains, + discrete_mechanism=discrete_config, + cross_attribute_constraints=cross_attribute_constraints, ) - - result = discrete_config.calibrate(zcdp_rho=discrete_zcdp_rho)( - rng, - data=discrete, - initial_measurements=one_way_measurements, - initial_potentials=initial_potentials, - ) - - synthetic_data = result.synthetic_data - logging.info('[SynthKit Tabular]: Generated discrete synthetic data.') - - # Convert synthetic data back to the original domain. 
- synthetic_columns = {} - for col in transform_fns: - synthetic_columns[col] = pd.Series( - [transform_fns[col].inverse(x) for x in synthetic_data.df[col]], - dtype=data[col].dtype, - ) - logging.info('[SynthKit Tabular]: Converted data back to original domain.') - - column_order = [col for col in data.columns if col in domains] - return pd.DataFrame(synthetic_columns)[column_order] + result = synth.calibrate( + epsilon=epsilon, + delta=delta, + numerical_bins=numerical_bins, + init_budget_fraction=one_way_marginal_budget_fraction, + )(np.random.default_rng(), data) + return result.synthetic_data diff --git a/tests/data_generation_v2_test.py b/tests/data_generation_v2_test.py index 412e65b..d94dc24 100644 --- a/tests/data_generation_v2_test.py +++ b/tests/data_generation_v2_test.py @@ -12,149 +12,33 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""Smoke test for the deprecated data_generation_v2 shim.""" + +import warnings + from absl.testing import absltest -from dpsynth import constraints from dpsynth import data_generation_v2 from dpsynth import domain import pandas as pd -class MechanismTest(absltest.TestCase): - - def test_end_to_end_categorical(self): - attribute_domains = { - "A": domain.CategoricalAttribute( - possible_values=["a", "b", "c"], out_of_domain_index=0 - ), - "B": domain.CategoricalAttribute( - possible_values=["x", "y", "z"], out_of_domain_index=0 - ), - "C": domain.OpenSetCategoricalAttribute(), - } - - values = [ - ["a", "x", "4"], - ["b", "y", "4"], - ["c", "z", "4"], - ] - - df = pd.DataFrame(data=values, columns=["A", "B", "C"]) - synthetic_df = data_generation_v2.generate( - df, - attribute_domains, - epsilon=100, - delta=0.1, - skip_compression=True, - ) - self.assertIsInstance(synthetic_df, pd.DataFrame) - - def test_end_to_end_numerical(self): - attribute_domains = { - "A": domain.NumericalAttribute(min_value=0, max_value=10), - "B": domain.NumericalAttribute(min_value=-10, 
max_value=10), - } - - values = [ - [5, 5], - [5, -10], - [0, -5], - ] +class DeprecationShimTest(absltest.TestCase): - df = pd.DataFrame(data=values, columns=["A", "B"], dtype=float) - synthetic_df = data_generation_v2.generate(df, attribute_domains, 1000, 0.1) - self.assertListEqual(synthetic_df.columns.tolist(), ["A", "B"]) - for col in attribute_domains: - dom = attribute_domains[col] - left, right = dom.min_value, dom.max_value - self.assertTrue(synthetic_df[col].between(left, right).all()) - - def test_end_to_end_categorical_with_constraint(self): + def test_generate_emits_deprecation_warning(self): attribute_domains = { - "A": domain.CategoricalAttribute( - possible_values=["a", "b", "c"], out_of_domain_index=0 - ), - "B": domain.CategoricalAttribute( - possible_values=["x", "y", "z"], out_of_domain_index=0 + 'A': domain.CategoricalAttribute( + possible_values=['a', 'b', 'c'], out_of_domain_index=0 ), } - - constraint = constraints.Constraint( - attribute_names=("A", "B"), - attribute_domains=( - attribute_domains["A"], - attribute_domains["B"], - ), - possible_combinations=[ - ("a", "x"), - ("b", "y"), - ("c", "z"), - ], - ) - - values = [ - ["a", "x"], - ["b", "y"], - ["c", "z"], - ["a", "y"], - ["b", "x"], - ["c", "x"], - ] - - df = pd.DataFrame(data=values, columns=["A", "B"]) - synthetic_df = data_generation_v2.generate( - df, - attribute_domains, - epsilon=1.0, - delta=1e-5, - discrete_config=data_generation_v2.discrete_mechanisms.MSTConfig(), - cross_attribute_constraints=[constraint], - skip_compression=True, - ) - - def is_valid(row): - return (row["A"], row["B"]) in constraint.possible_combinations - - self.assertTrue(synthetic_df.apply(is_valid, axis=1).all()) - - def test_end_to_end_mixed_domain(self): - attribute_domains = { - "A": domain.OpenSetCategoricalAttribute(), - "B": domain.NumericalAttribute(min_value=0, max_value=10), - } - - values = [ - ["a", 1], - ["b", 5], - ["c", 10], - ] - - df = pd.DataFrame(data=values, columns=["A", "B"]) - 
df["B"] = df["B"].astype(float) - synthetic_df = data_generation_v2.generate( - df, - attribute_domains, - epsilon=100, - delta=0.1, - skip_compression=True, - ) - self.assertIsInstance(synthetic_df, pd.DataFrame) - self.assertListEqual(synthetic_df.columns.tolist(), ["A", "B"]) - dom_b = attribute_domains["B"] - self.assertTrue( - synthetic_df["B"].between(dom_b.min_value, dom_b.max_value).all() - ) - - def test_raises_on_freeform_text_attribute(self): - attribute_domains = { - "A": domain.CategoricalAttribute(possible_values=["a", "b"]), - "text": domain.FreeFormTextAttribute(max_tokens=128), - } - df = pd.DataFrame({"A": ["a", "b"], "text": ["hello", "world"]}) - with self.assertRaises(ValueError): - data_generation_v2.generate( - df, attribute_domains, epsilon=1.0, delta=1e-5 + df = pd.DataFrame({'A': ['a', 'b', 'c']}) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + synthetic_df = data_generation_v2.generate( + df, attribute_domains, epsilon=100, delta=0.1, skip_compression=True ) + self.assertTrue(any(issubclass(x.category, DeprecationWarning) for x in w)) + self.assertIsInstance(synthetic_df, pd.DataFrame) -if __name__ == "__main__": +if __name__ == '__main__': absltest.main()