Skip to content
Open
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,104 @@ For a connection to a SQL endpoint you need to use the HTTP path from the endpoi
{"connect_args": {"http_path": "/sql/1.0/endpoints/****", "driver_path": "/path/to/odbc/driver"}}
```

##### OAuth2 Authentication

Superset supports OAuth2 authentication for Databricks, allowing users to authenticate with their personal Databricks accounts instead of using shared access tokens. This provides better security and audit capabilities.

###### Prerequisites

1. Create an OAuth2 application in your Databricks account:
- Go to your Databricks account console
- Navigate to **Settings** → **Developer** → **OAuth apps**
- Create a new OAuth app with the redirect URI: `http://your-superset-host:port/api/v1/database/oauth2/`

2. Configure OAuth2 in your `superset_config.py`:

```python
from datetime import timedelta

# OAuth2 configuration for Databricks
# The authorization endpoint is auto-detected from your Databricks cloud provider;
# the token endpoint is not, so set "token_request_uri" explicitly for each client.
DATABASE_OAUTH2_CLIENTS = {
"Databricks (legacy)": {
"id": "your-databricks-client-id",
"secret": "your-databricks-client-secret",
"scope": "sql",
# The authorization endpoint is auto-detected from the hostname; the
# token endpoint must be set explicitly (no DB context at exchange):
# AWS: "authorization_request_uri": "https://accounts.cloud.databricks.com/oidc/accounts/{account_id}/v1/authorize",
# Azure: "authorization_request_uri": "https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/authorize",
# GCP: "authorization_request_uri": "https://accounts.gcp.databricks.com/oidc/accounts/{account_id}/v1/authorize",
# "token_request_uri": "https://<provider-token-endpoint>",
},
"Databricks": {
"id": "your-databricks-client-id",
"secret": "your-databricks-client-secret",
"scope": "sql",
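# Authorization endpoint auto-detected from hostname. The token endpoint
# is not: the token exchange fails without an explicit "token_request_uri",
# so uncomment the line below and fill in your provider's token endpoint.
# "token_request_uri": "https://<provider-token-endpoint>",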
},
}

# OAuth2 redirect URI (adjust hostname/port for your setup)
DATABASE_OAUTH2_REDIRECT_URI = "http://your-superset-host:port/api/v1/database/oauth2/"

# Optional: OAuth2 timeout
DATABASE_OAUTH2_TIMEOUT = timedelta(seconds=30)
```

Replace the following placeholders:
- `your-databricks-client-id`: Your Databricks OAuth2 application client ID
- `your-databricks-client-secret`: Your Databricks OAuth2 application client secret
- `your-superset-host:port`: Your Superset instance hostname and port

**Multi-Cloud Provider Support**

Superset automatically detects your Databricks cloud provider and uses it to build the OAuth2 **authorization** endpoint (the token endpoint must always be configured explicitly, as described below):

- **AWS**: Used for hostnames containing `cloud.databricks.com`, and as the default when neither Azure nor GCP is detected
- **Azure**: Detected from hostnames containing `azure` or `azuredatabricks`
- **GCP**: Detected from hostnames containing `gcp` or `googleusercontent`

You can also explicitly specify the cloud provider, along with the account
identifier used to build the OAuth2 endpoints, in your database configuration
under **Advanced** → **Other** → **ENGINE PARAMETERS**:

```json
{
"cloud_provider": "azure",
"tenant_id": "your-azure-tenant-id"
}
```

For AWS and GCP, supply `account_id` instead:

```json
{
"cloud_provider": "aws",
"account_id": "your-databricks-account-id"
}
```

Valid cloud provider values are: `aws`, `azure`, `gcp`. The **authorization**
endpoint is auto-detected: Superset substitutes this identifier into the
provider's authorization template. The **token** endpoint is not auto-resolved
(token exchange has no database context to detect the provider), so for the
auto-detected flow you must still supply a fully-resolved `token_request_uri`
in `DATABASE_OAUTH2_CLIENTS`. If you supply fully-resolved
`authorization_request_uri` and `token_request_uri` values, those take
precedence and no `account_id`/`tenant_id` is required.

###### Usage

Once configured, users can:

1. Connect to Databricks databases normally using access tokens
2. When querying data, Superset will automatically redirect users to authenticate with Databricks if needed
3. User-specific OAuth2 tokens will be used for database connections, providing better security and audit trails

This feature works with both "Databricks (legacy)" and "Databricks" engine types and automatically supports all major cloud providers (AWS, Azure, GCP).

#### Denodo

The recommended connector library for Denodo is
Expand Down
238 changes: 238 additions & 0 deletions superset/db_engine_specs/databricks.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,18 @@
)
from superset.db_engine_specs.hive import HiveEngineSpec
from superset.errors import ErrorLevel, SupersetError, SupersetErrorType
from superset.exceptions import OAuth2Error, OAuth2RedirectError
from superset.utils import json
from superset.utils.core import get_user_agent, QuerySource
from superset.utils.network import is_hostname_valid, is_port_open

if TYPE_CHECKING:
from superset.models.core import Database
from superset.superset_typing import (
OAuth2ClientConfig,
OAuth2State,
OAuth2TokenResponse,
)


try:
Expand Down Expand Up @@ -277,6 +283,102 @@ class DatabricksDynamicBaseEngineSpec(BasicParametersMixin, DatabricksBaseEngine
"port": "port",
}

# OAuth2 endpoint templates, keyed by cloud provider ("aws", "azure", "gcp").
# Each template carries a single positional ``{}`` placeholder, which
# ``_resolve_oauth2_endpoint`` fills with the ``account_id`` / ``tenant_id``
# read from the database's ``extra``. The provider keys double as the
# allow-list for an explicit ``cloud_provider`` in ``_detect_cloud_provider``.
# NOTE(review): only the "authorization_request_uri" templates are resolved by
# the code visible here; confirm whether the "token_request_uri" templates are
# consumed elsewhere.
_oauth2_endpoints = {
"aws": {
"authorization_request_uri": "https://accounts.cloud.databricks.com/oidc/accounts/{}/v1/authorize",
"token_request_uri": "https://accounts.cloud.databricks.com/oidc/accounts/{}/v1/token",
},
# Azure uses the Microsoft login endpoints; the placeholder is the tenant id.
"azure": {
"authorization_request_uri": "https://login.microsoftonline.com/{}/oauth2/v2.0/authorize",
"token_request_uri": "https://login.microsoftonline.com/{}/oauth2/v2.0/token",
},
"gcp": {
"authorization_request_uri": "https://accounts.gcp.databricks.com/oidc/accounts/{}/v1/authorize",
"token_request_uri": "https://accounts.gcp.databricks.com/oidc/accounts/{}/v1/token",
},
}

@classmethod
def _detect_cloud_provider(cls, database: Database) -> str:
    """
    Detect the cloud provider hosting the Databricks workspace.

    Resolution order:

    1. An explicit ``cloud_provider`` in the database's ``extra``, when it is
       a string naming one of the keys of ``_oauth2_endpoints`` (matched
       case-insensitively, surrounding whitespace ignored). Any other value
       (unknown provider, ``null``, non-string) is ignored.
    2. The hostname of the database URL: the AWS domain
       ``cloud.databricks.com`` first, then the Azure and GCP markers.
    3. ``"aws"`` as the default when nothing else matches.

    Args:
        database: The database whose ``extra`` and URL host are inspected.

    Returns:
        str: The cloud provider ('aws', 'azure', or 'gcp')
    """
    configured = cls.get_extra_params(database).get("cloud_provider")
    # ``extra`` is free-form JSON, so the value may be ``null`` or a
    # non-string; calling ``.lower()`` on it blindly would raise.
    if isinstance(configured, str):
        provider = configured.strip().lower()
        if provider in cls._oauth2_endpoints:
            return provider

    hostname = (database.url_object.host or "").lower()

    # Match the AWS domain before the substring checks below: an AWS workspace
    # with a vanity name such as ``team-azure.cloud.databricks.com`` would
    # otherwise be misclassified by the bare "azure"/"gcp" tests.
    if "cloud.databricks.com" in hostname:
        return "aws"
    # "azure" also covers "azuredatabricks" hostnames.
    if "azure" in hostname:
        return "azure"
    if "gcp" in hostname or "googleusercontent" in hostname:
        return "gcp"

    # Default to AWS for compatibility
    return "aws"

@classmethod
def _resolve_oauth2_endpoint(
    cls,
    database: Database,
    provider: str,
    endpoint_key: str,
) -> str:
    """
    Build a fully-resolved OAuth2 endpoint for the given cloud provider.

    The per-provider templates in ``_oauth2_endpoints`` carry a single ``{}``
    placeholder. It is filled from the database's ``extra``: Azure endpoints
    are keyed by ``tenant_id``, the AWS/GCP ones by ``account_id``. The
    provider's own key wins when both are present; the other key is still
    accepted as a fallback so existing configurations keep working.

    Args:
        database: The database whose ``extra`` holds the identifier.
        provider: A key of ``_oauth2_endpoints`` ('aws', 'azure' or 'gcp').
        endpoint_key: The template to resolve, e.g.
            ``"authorization_request_uri"``.

    Returns:
        str: The endpoint with the placeholder substituted, or the template
        itself when it has no placeholder.

    Raises:
        OAuth2Error: When no non-blank identifier is configured, so the flow
            never issues a request to an unresolved ``.../{}/...`` endpoint.
        KeyError: When ``provider`` or ``endpoint_key`` is unknown.
    """
    template = cls._oauth2_endpoints[provider][endpoint_key]
    if "{}" not in template:
        return template

    extra = cls.get_extra_params(database)
    # Prefer the identifier that belongs to the provider: a Databricks
    # account id must not end up in a Microsoft login URL when both are set.
    lookup_order = (
        ("tenant_id", "account_id")
        if provider == "azure"
        else ("account_id", "tenant_id")
    )
    # Blank / whitespace-only values count as missing.
    candidates = (str(extra.get(key) or "").strip() for key in lookup_order)
    identifier = next(filter(None, candidates), "")
    if not identifier:
        raise OAuth2Error(
            "Databricks OAuth2 endpoints could not be resolved: set "
            "`account_id` (or `tenant_id` for Azure) in the database's "
            "engine parameters, or provide a fully-resolved "
            f"`{endpoint_key}` in DATABASE_OAUTH2_CLIENTS."
        )
    return template.format(identifier)

@classmethod
def impersonate_user(
    cls,
    database: Database,
    username: str | None,
    user_token: str | None,
    url: URL,
    engine_kwargs: dict[str, Any],
) -> tuple[URL, dict[str, Any]]:
    """
    Swap the connection credentials for the user's OAuth2 access token.

    Without a token the URL and engine kwargs are returned untouched. With
    one, the URL password is replaced by the token, and an ``access_token``
    entry already present in ``connect_args`` is overwritten with it.
    ``engine_kwargs`` is updated in place and gains a ``connect_args`` dict
    when it had none.

    Returns:
        tuple: The (possibly updated) URL and the same ``engine_kwargs`` dict.
    """
    if not user_token:
        return url, engine_kwargs

    # The personal token takes the place of the shared one in the URL ...
    impersonated_url = url.set(password=user_token)

    # ... and in the driver arguments, but only where one was configured.
    driver_args = engine_kwargs.setdefault("connect_args", {})
    if "access_token" in driver_args:
        driver_args["access_token"] = user_token

    return impersonated_url, engine_kwargs

@staticmethod
def get_extra_params(
database: Database, source: QuerySource | None = None
Expand Down Expand Up @@ -474,6 +576,74 @@ class DatabricksNativeEngineSpec(DatabricksDynamicBaseEngineSpec):
supports_dynamic_catalog = True
supports_cross_catalog_queries = True

# OAuth 2.0 support: opt this engine spec in to per-user OAuth2.
# NOTE(review): how the base engine spec consumes these attributes is not
# visible in this chunk - confirm against BaseEngineSpec.
supports_oauth2 = True
# Exception type tied to the OAuth2 flow for this spec; presumably the one
# that triggers the redirect to the authorization page - confirm.
oauth2_exception = OAuth2RedirectError
# Default scope requested for this engine.
oauth2_scope = "sql"
Comment thread
rusackas marked this conversation as resolved.

# No static endpoints: the correct URIs depend on the workspace's cloud
# provider and account, so both defaults are empty.
# ``get_oauth2_authorization_uri`` resolves the authorization endpoint per
# database whenever the client config carries no value for it.
oauth2_authorization_request_uri = ""
# NOT resolved dynamically: ``get_oauth2_token`` raises ``OAuth2Error`` unless
# the client config carries an explicit ``token_request_uri``.
oauth2_token_request_uri = ""

@classmethod
def get_oauth2_authorization_uri(
    cls,
    config: "OAuth2ClientConfig",
    state: "OAuth2State",
    code_verifier: str | None = None,
) -> str:
    """
    Return URI for initial OAuth2 request with dynamic endpoint detection.

    A fully-resolved `authorization_request_uri` from `DATABASE_OAUTH2_CLIENTS`
    is preserved. Only when none is configured is the endpoint built from the
    database referenced by ``state["database_id"]``: its cloud provider is
    detected and the provider's authorization template is resolved with the
    account/tenant id from the database's engine parameters.

    Raises:
        OAuth2Error: When no `authorization_request_uri` is configured and the
            database cannot be loaded, or its endpoint cannot be resolved.
            Failing here avoids delegating with an empty authorization URI.
    """
    if not config.get("authorization_request_uri"):
        # Function-scope imports, as in the original implementation
        # (presumably to avoid a circular import at module load - confirm).
        from typing import cast

        from superset import db
        from superset.models.core import Database

        # Get the database to detect cloud provider
        database_id = state["database_id"]
        database = db.session.get(Database, database_id)
        if database is None:
            raise OAuth2Error(
                "Databricks OAuth2 authorization endpoint could not be "
                f"resolved: database {database_id} was not found. Provide a "
                "fully-resolved `authorization_request_uri` in "
                "DATABASE_OAUTH2_CLIENTS."
            )

        provider = cls._detect_cloud_provider(database)
        # Work on a copy so the caller's config is never mutated.
        config = cast(
            "OAuth2ClientConfig",
            dict(config)
            | {
                "authorization_request_uri": cls._resolve_oauth2_endpoint(
                    database, provider, "authorization_request_uri"
                )
            },
        )

    return super().get_oauth2_authorization_uri(config, state, code_verifier)

@classmethod
def get_oauth2_token(
    cls,
    config: "OAuth2ClientConfig",
    code: str,
    code_verifier: str | None = None,
) -> "OAuth2TokenResponse":
    """
    Trade an authorization code for refresh/access tokens.

    Unlike the authorization request, this step only receives the client
    config, the code and the optional verifier - there is no database from
    which a cloud provider could be detected. The token endpoint therefore
    has to arrive already resolved in ``config``; when it is missing or
    empty the exchange is refused instead of posting to an unresolved URL.

    Raises:
        OAuth2Error: When ``config`` has no ``token_request_uri``.
    """
    if config.get("token_request_uri"):
        return super().get_oauth2_token(config, code, code_verifier)

    raise OAuth2Error(
        "Databricks OAuth2 token endpoint is not configured: provide a "
        "fully-resolved `token_request_uri` in DATABASE_OAUTH2_CLIENTS."
    )
Comment thread
rusackas marked this conversation as resolved.

@classmethod
def build_sqlalchemy_uri( # type: ignore
cls, parameters: DatabricksNativeParametersType, *_
Expand Down Expand Up @@ -685,6 +855,74 @@ class DatabricksPythonConnectorEngineSpec(DatabricksDynamicBaseEngineSpec):

supports_dynamic_schema = supports_catalog = supports_dynamic_catalog = True

# OAuth 2.0 support: opt this engine spec in to per-user OAuth2.
# NOTE(review): how the base engine spec consumes these attributes is not
# visible in this chunk - confirm against BaseEngineSpec.
supports_oauth2 = True
# Exception type tied to the OAuth2 flow for this spec; presumably the one
# that triggers the redirect to the authorization page - confirm.
oauth2_exception = OAuth2RedirectError
# Default scope requested for this engine.
oauth2_scope = "sql"

# No static endpoints: the correct URIs depend on the workspace's cloud
# provider and account, so both defaults are empty.
# ``get_oauth2_authorization_uri`` resolves the authorization endpoint per
# database whenever the client config carries no value for it.
oauth2_authorization_request_uri = ""
# NOT resolved dynamically: ``get_oauth2_token`` raises ``OAuth2Error`` unless
# the client config carries an explicit ``token_request_uri``.
oauth2_token_request_uri = ""

@classmethod
def get_oauth2_authorization_uri(
    cls,
    config: "OAuth2ClientConfig",
    state: "OAuth2State",
    code_verifier: str | None = None,
) -> str:
    """
    Return URI for initial OAuth2 request with dynamic endpoint detection.

    A fully-resolved `authorization_request_uri` from `DATABASE_OAUTH2_CLIENTS`
    is preserved. Only when none is configured is the endpoint built from the
    database referenced by ``state["database_id"]``: its cloud provider is
    detected and the provider's authorization template is resolved with the
    account/tenant id from the database's engine parameters.

    Raises:
        OAuth2Error: When no `authorization_request_uri` is configured and the
            database cannot be loaded, or its endpoint cannot be resolved.
            Failing here avoids delegating with an empty authorization URI.
    """
    if not config.get("authorization_request_uri"):
        # Function-scope imports, as in the original implementation
        # (presumably to avoid a circular import at module load - confirm).
        from typing import cast

        from superset import db
        from superset.models.core import Database

        # Get the database to detect cloud provider
        database_id = state["database_id"]
        database = db.session.get(Database, database_id)
        if database is None:
            raise OAuth2Error(
                "Databricks OAuth2 authorization endpoint could not be "
                f"resolved: database {database_id} was not found. Provide a "
                "fully-resolved `authorization_request_uri` in "
                "DATABASE_OAUTH2_CLIENTS."
            )

        provider = cls._detect_cloud_provider(database)
        # Work on a copy so the caller's config is never mutated.
        config = cast(
            "OAuth2ClientConfig",
            dict(config)
            | {
                "authorization_request_uri": cls._resolve_oauth2_endpoint(
                    database, provider, "authorization_request_uri"
                )
            },
        )

    return super().get_oauth2_authorization_uri(config, state, code_verifier)

@classmethod
def get_oauth2_token(
    cls,
    config: "OAuth2ClientConfig",
    code: str,
    code_verifier: str | None = None,
) -> "OAuth2TokenResponse":
    """
    Trade an authorization code for refresh/access tokens.

    Unlike the authorization request, this step only receives the client
    config, the code and the optional verifier - there is no database from
    which a cloud provider could be detected. The token endpoint therefore
    has to arrive already resolved in ``config``; when it is missing or
    empty the exchange is refused instead of posting to an unresolved URL.

    Raises:
        OAuth2Error: When ``config`` has no ``token_request_uri``.
    """
    if config.get("token_request_uri"):
        return super().get_oauth2_token(config, code, code_verifier)

    raise OAuth2Error(
        "Databricks OAuth2 token endpoint is not configured: provide a "
        "fully-resolved `token_request_uri` in DATABASE_OAUTH2_CLIENTS."
    )
Comment thread
rusackas marked this conversation as resolved.

@classmethod
def build_sqlalchemy_uri( # type: ignore
cls, parameters: DatabricksPythonConnectorParametersType, *_
Expand Down
Loading
Loading