From 65b9b0a7fccc7507d5fbe9a1d667fd8f1b021494 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Mon, 25 Aug 2025 11:59:45 +0200 Subject: [PATCH] Update examples based on templates --- default_python/README.md | 50 +++++---------- default_python/conftest.py | 76 ----------------------- default_python/pyproject.toml | 16 +++-- default_python/scratch/exploration.ipynb | 2 +- default_python/src/default_python/main.py | 19 ++++-- default_python/src/dlt_pipeline.ipynb | 2 +- default_python/src/notebook.ipynb | 2 +- default_python/tests/main_test.py | 6 +- scripts/update_from_templates.sh | 8 +-- 9 files changed, 53 insertions(+), 128 deletions(-) delete mode 100644 default_python/conftest.py diff --git a/default_python/README.md b/default_python/README.md index c4f3a2b6..74a1f42d 100644 --- a/default_python/README.md +++ b/default_python/README.md @@ -2,39 +2,18 @@ The 'default_python' project was generated by using the default-python template. -For documentation on the Databricks Asset Bundles format use for this project, -and for CI/CD configuration, see https://docs.databricks.com/aws/en/dev-tools/bundles. - ## Getting started -Choose how you want to work on this project: - -(a) Directly in your Databricks workspace, see - https://docs.databricks.com/dev-tools/bundles/workspace. - -(b) Locally with an IDE like Cursor or VS Code, see - https://docs.databricks.com/vscode-ext. - -(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html - - -Dependencies for this project should be installed using UV: +0. Install UV: https://docs.astral.sh/uv/getting-started/installation/ -* Make sure you have the UV package manager installed. - It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. -* Run `uv sync --dev` to install the project's dependencies. +1. 
Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html -# Using this project using the CLI - -The Databricks workspace and IDE extensions provide a graphical interface for working -with this project. It's also possible to interact with it directly using the CLI: - -1. Authenticate to your Databricks workspace, if you have not done so already: +2. Authenticate to your Databricks workspace, if you have not done so already: ``` $ databricks configure ``` -2. To deploy a development copy of this project, type: +3. To deploy a development copy of this project, type: ``` $ databricks bundle deploy --target dev ``` @@ -44,9 +23,9 @@ with this project. It's also possible to interact with it directly using the CLI This deploys everything that's defined for this project. For example, the default template would deploy a job called `[dev yourname] default_python_job` to your workspace. - You can find that job by opening your workpace and clicking on **Jobs & Pipelines**. + You can find that job by opening your workspace and clicking on **Workflows**. -3. Similarly, to deploy a production copy, type: +4. Similarly, to deploy a production copy, type: ``` $ databricks bundle deploy --target prod ``` @@ -56,12 +35,17 @@ with this project. It's also possible to interact with it directly using the CLI is paused when deploying in development mode (see https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). -4. To run a job or pipeline, use the "run" command: +5. To run a job or pipeline, use the "run" command: ``` $ databricks bundle run ``` - -5. Finally, to run tests locally, use `pytest`: - ``` - $ uv run pytest - ``` +6. Optionally, install the Databricks extension for Visual Studio Code for local development from + https://docs.databricks.com/dev-tools/vscode-ext.html. It can configure your + virtual environment and set up Databricks Connect for running unit tests locally. 
+ When not using these tools, consult your development environment's documentation + and/or the documentation for Databricks Connect for manually setting up your environment + (https://docs.databricks.com/en/dev-tools/databricks-connect/python/index.html). + +7. For documentation on the Databricks asset bundles format used + for this project, and for CI/CD configuration, see + https://docs.databricks.com/dev-tools/bundles/index.html. diff --git a/default_python/conftest.py b/default_python/conftest.py deleted file mode 100644 index cf1d0978..00000000 --- a/default_python/conftest.py +++ /dev/null @@ -1,76 +0,0 @@ -"""This file configures pytest. - -This file is in the root since it can be used for tests in any place in this -project, including tests under resources/. -""" - -import os, sys, pathlib -from contextlib import contextmanager - - -try: - from databricks.connect import DatabricksSession - from databricks.sdk import WorkspaceClient - from pyspark.sql import SparkSession - import pytest -except ImportError: - raise ImportError("Test dependencies not found.\n\nRun tests using 'uv run pytest'. 
See http://docs.astral.sh/uv to learn more about uv.") - - -def add_all_resources_to_sys_path(): - """Add all resources/* directories to sys.path for module discovery.""" - resources = pathlib.Path(__file__).with_name("resources") - resource_dirs = filter(pathlib.Path.is_dir, resources.iterdir()) - seen: dict[str, pathlib.Path] = {} - for resource in resource_dirs: - sys.path.append(str(resource.resolve())) - for py in resource.rglob("*.py"): - mod = ".".join(py.relative_to(resource).with_suffix("").parts) - if mod in seen: - raise ImportError(f"Duplicate module '{mod}' found:\n {seen[mod]}\n {py}") - seen[mod] = py - - -def enable_fallback_compute(): - """Enable serverless compute if no compute is specified.""" - conf = WorkspaceClient().config - if conf.serverless_compute_id or conf.cluster_id or os.environ.get("SPARK_REMOTE"): - return - - url = "https://docs.databricks.com/dev-tools/databricks-connect/cluster-config" - print("☁️ no compute specified, falling back to serverless compute", file=sys.stderr) - print(f" see {url} for manual configuration", file=sys.stdout) - - os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto" - - -@contextmanager -def allow_stderr_output(config: pytest.Config): - """Temporarily disable pytest output capture.""" - capman = config.pluginmanager.get_plugin("capturemanager") - if capman: - with capman.global_and_fixture_disabled(): - yield - else: - yield - - -def pytest_configure(config: pytest.Config): - """Configure pytest session.""" - with allow_stderr_output(config): - add_all_resources_to_sys_path() - enable_fallback_compute() - - # Initialize Spark session eagerly, so it is available even when - # SparkSession.builder.getOrCreate() is used. For DB Connect 15+, - # we validate version compatibility with the remote cluster. 
- if hasattr(DatabricksSession.builder, "validateSession"): - DatabricksSession.builder.validateSession().getOrCreate() - else: - DatabricksSession.builder.getOrCreate() - - -@pytest.fixture(scope="session") -def spark() -> SparkSession: - """Provide a SparkSession fixture for tests.""" - return DatabricksSession.builder.getOrCreate() diff --git a/default_python/pyproject.toml b/default_python/pyproject.toml index dda79245..d32e108a 100644 --- a/default_python/pyproject.toml +++ b/default_python/pyproject.toml @@ -4,7 +4,7 @@ version = "0.0.1" authors = [{ name = "user@company.com" }] requires-python = ">= 3.11" -[dependency-groups] +[project.optional-dependencies] dev = [ "pytest", @@ -12,10 +12,16 @@ dev = [ "databricks-dlt", # databricks-connect can be used to run parts of this project locally. - # Note that for local development, you should use a version that is not newer - # than the remote cluster or serverless compute you connect to. - # See also https://docs.databricks.com/dev-tools/databricks-connect.html. - "databricks-connect>=15.4,<15.5", + # See https://docs.databricks.com/dev-tools/databricks-connect.html. + # + # Note, databricks-connect is automatically installed if you're using Databricks + # extension for Visual Studio Code + # (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html). + # + # To manually install databricks-connect, uncomment the line below to install a version + # of db-connect that corresponds to the Databricks Runtime version used for this project. 
+ # See https://docs.databricks.com/dev-tools/databricks-connect.html + # "databricks-connect>=15.4,<15.5", ] [tool.pytest.ini_options] diff --git a/default_python/scratch/exploration.ipynb b/default_python/scratch/exploration.ipynb index 57a9c978..f7832011 100644 --- a/default_python/scratch/exploration.ipynb +++ b/default_python/scratch/exploration.ipynb @@ -32,7 +32,7 @@ "sys.path.append(\"../src\")\n", "from default_python import main\n", "\n", - "main.get_taxis().show(10)" + "main.get_taxis(spark).show(10)" ] } ], diff --git a/default_python/src/default_python/main.py b/default_python/src/default_python/main.py index 04e8be4d..5ae344c7 100644 --- a/default_python/src/default_python/main.py +++ b/default_python/src/default_python/main.py @@ -1,13 +1,24 @@ -from databricks.sdk.runtime import spark -from pyspark.sql import DataFrame +from pyspark.sql import SparkSession, DataFrame -def find_all_taxis() -> DataFrame: +def get_taxis(spark: SparkSession) -> DataFrame: return spark.read.table("samples.nyctaxi.trips") +# Create a new Databricks Connect session. If this fails, +# check that you have configured Databricks Connect correctly. +# See https://docs.databricks.com/dev-tools/databricks-connect.html. 
+def get_spark() -> SparkSession: + try: + from databricks.connect import DatabricksSession + + return DatabricksSession.builder.getOrCreate() + except ImportError: + return SparkSession.builder.getOrCreate() + + def main(): - find_all_taxis().show(5) + get_taxis(get_spark()).show(5) if __name__ == "__main__": diff --git a/default_python/src/dlt_pipeline.ipynb b/default_python/src/dlt_pipeline.ipynb index 34e1895e..eb93d319 100644 --- a/default_python/src/dlt_pipeline.ipynb +++ b/default_python/src/dlt_pipeline.ipynb @@ -56,7 +56,7 @@ "source": [ "@dlt.view\n", "def taxi_raw():\n", - " return main.find_all_taxis()\n", + " return main.get_taxis(spark)\n", "\n", "\n", "@dlt.table\n", diff --git a/default_python/src/notebook.ipynb b/default_python/src/notebook.ipynb index fd49e5b9..fe99fd32 100644 --- a/default_python/src/notebook.ipynb +++ b/default_python/src/notebook.ipynb @@ -46,7 +46,7 @@ "source": [ "from default_python import main\n", "\n", - "main.find_all_taxis().show(10)" + "main.get_taxis(spark).show(10)" ] } ], diff --git a/default_python/tests/main_test.py b/default_python/tests/main_test.py index 66c27024..66b6f0a2 100644 --- a/default_python/tests/main_test.py +++ b/default_python/tests/main_test.py @@ -1,6 +1,6 @@ -from default_python import main +from default_python.main import get_taxis, get_spark -def test_find_all_taxis(): - taxis = main.find_all_taxis() +def test_main(): + taxis = get_taxis(get_spark()) assert taxis.count() > 5 diff --git a/scripts/update_from_templates.sh b/scripts/update_from_templates.sh index a8b74fcf..c56a4b82 100755 --- a/scripts/update_from_templates.sh +++ b/scripts/update_from_templates.sh @@ -50,12 +50,12 @@ if [ ! 
"$DATABRICKS_HOST" ]; then exit 1 fi -if [ -n "$1" ]; then +# Prompt for CURRENT_USER_NAME if not passed as first arg +if [ -n "${1-}" ]; then CURRENT_USER_NAME="$1" else - read -p "Enter the current user name (e.g., 'lennart_kats'): " CURRENT_USER_NAME - read -p "Enter the current user name (e.g., 'lennart_kats'): " CURRENT_USER_NAME - if [ ! "$CURRENT_USER_NAME" ]; then + read -r -p "Enter the current user name of your 'DEFAULT' profile (e.g., 'lennart_kats'): " CURRENT_USER_NAME + if [ -z "${CURRENT_USER_NAME:-}" ]; then echo "Error: current user name is required." >&2 exit 1 fi