diff --git a/.ruff.toml b/.ruff.toml index 6845a35c..25bebc2a 100644 --- a/.ruff.toml +++ b/.ruff.toml @@ -5,4 +5,5 @@ exclude = [ "default_python/*", "default_sql/*", "mlops_stacks/*", + "lakeflow_pipelines_python/*", ] diff --git a/contrib/data_engineering/README.md b/contrib/data_engineering/README.md index 549f2941..d3433fc4 100644 --- a/contrib/data_engineering/README.md +++ b/contrib/data_engineering/README.md @@ -1,20 +1,6 @@ # data_engineering -The 'data_engineering' project was generated by using the contrib/data-engineering template. - -Learn more about this template here: - -https://github.com/databricks/bundle-examples/tree/main/contrib/templates/data-engineering - -You can re-create this bundle by running the following commands: - -``` -$ databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering -# (answer prompts, call the project data_engineering) -$ cd data_engineering -$ uv run add-asset -# (select etl-pipeline) -``` +The 'data_engineering' project was generated by using the data-engineering template. ## Setup diff --git a/contrib/data_engineering/assets/etl_pipeline/__init__.py b/contrib/data_engineering/assets/etl_pipeline/__init__.py deleted file mode 100644 index 67f4c4ca..00000000 --- a/contrib/data_engineering/assets/etl_pipeline/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# This is the entry point for the {{.pipeline_name}} pipeline. -# It makes sure all transformations in the transformations directory are included. 
-import transformations - -__all__ = ["transformations"] diff --git a/contrib/data_engineering/assets/etl_pipeline/etl_pipeline.pipeline.yml b/contrib/data_engineering/assets/etl_pipeline/etl_pipeline.pipeline.yml deleted file mode 100644 index 3a36f0e5..00000000 --- a/contrib/data_engineering/assets/etl_pipeline/etl_pipeline.pipeline.yml +++ /dev/null @@ -1,12 +0,0 @@ -resources: - pipelines: - etl_pipeline: - name: etl_pipeline - serverless: true - catalog: ${var.catalog} - target: ${var.schema} - libraries: - - file: - path: sources/${bundle.target}/*.py - - file: - path: __init__.py diff --git a/contrib/data_engineering/assets/etl_pipeline/sources/dev/taxis.py b/contrib/data_engineering/assets/etl_pipeline/sources/dev/taxis.py deleted file mode 100644 index 1fba2e27..00000000 --- a/contrib/data_engineering/assets/etl_pipeline/sources/dev/taxis.py +++ /dev/null @@ -1,8 +0,0 @@ -import dlt -from pyspark.sql import DataFrame -from databricks.sdk.runtime import spark - - -@dlt.view(comment="Small set of taxis for development (uses LIMIT 10)") -def taxis() -> DataFrame: - return spark.sql("SELECT * FROM samples.nyctaxi.trips LIMIT 10") diff --git a/contrib/data_engineering/assets/etl_pipeline/sources/prod/taxis.py b/contrib/data_engineering/assets/etl_pipeline/sources/prod/taxis.py deleted file mode 100644 index 15ce56a0..00000000 --- a/contrib/data_engineering/assets/etl_pipeline/sources/prod/taxis.py +++ /dev/null @@ -1,8 +0,0 @@ -import dlt -from pyspark.sql import DataFrame -from databricks.sdk.runtime import spark - - -@dlt.view -def taxis() -> DataFrame: - return spark.sql("SELECT * FROM samples.nyctaxi.trips") diff --git a/contrib/data_engineering/assets/etl_pipeline/tests/taxi_stats_test.py b/contrib/data_engineering/assets/etl_pipeline/tests/taxi_stats_test.py deleted file mode 100644 index b0c4449c..00000000 --- a/contrib/data_engineering/assets/etl_pipeline/tests/taxi_stats_test.py +++ /dev/null @@ -1,7 +0,0 @@ -from ..sources.dev.taxis import taxis -from 
..transformations import taxi_stats - - -def test_taxi_stats(): - result = taxi_stats.filter_taxis(taxis()) - assert len(result.collect()) > 5 diff --git a/contrib/data_engineering/assets/etl_pipeline/transformations/__init__.py b/contrib/data_engineering/assets/etl_pipeline/transformations/__init__.py deleted file mode 100644 index 80577db0..00000000 --- a/contrib/data_engineering/assets/etl_pipeline/transformations/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# __init__.py defines the 'transformations' Python package -import importlib -import pkgutil - - -# Import all modules in the package except those starting with '_', like '__init__.py' -for _, module_name, _ in pkgutil.iter_modules(__path__): - if not module_name.startswith("_"): - importlib.import_module(f"{__name__}.{module_name}") diff --git a/contrib/data_engineering/assets/etl_pipeline/transformations/taxi_stats.py b/contrib/data_engineering/assets/etl_pipeline/transformations/taxi_stats.py deleted file mode 100644 index 5c5dcd9a..00000000 --- a/contrib/data_engineering/assets/etl_pipeline/transformations/taxi_stats.py +++ /dev/null @@ -1,20 +0,0 @@ -import dlt -from pyspark.sql.functions import to_date, count -from pyspark.sql import DataFrame - - -@dlt.table(comment="Daily statistics of NYC Taxi trips") -def taxi_stats() -> DataFrame: - """Read from the 'taxis' view from etl_pipeline/sources.""" - taxis = dlt.read("taxis") - - return filter_taxis(taxis) - - -def filter_taxis(taxis: DataFrame) -> DataFrame: - """Group by date and calculate the number of trips.""" - return ( - taxis.withColumn("pickup_date", to_date("tpep_pickup_datetime")) - .groupBy("pickup_date") - .agg(count("*").alias("number_of_trips")) - ) diff --git a/contrib/data_engineering/databricks.yml b/contrib/data_engineering/databricks.yml index 23454029..0577aa4b 100644 --- a/contrib/data_engineering/databricks.yml +++ b/contrib/data_engineering/databricks.yml @@ -26,21 +26,21 @@ targets: workspace: host: https://company.databricks.com 
variables: - catalog: main + catalog: catalog schema: ${workspace.current_user.short_name} notifications: [] prod: mode: production workspace: host: https://company.databricks.com - # We explicitly specify /Workspace/Users/user@databricks.com to make sure we only have a single copy. - root_path: /Workspace/Users/user@databricks.com/.bundle/${bundle.name}/${bundle.target} + # We explicitly specify /Workspace/Users/user@company.com to make sure we only have a single copy. + root_path: /Workspace/Users/user@company.com/.bundle/${bundle.name}/${bundle.target} permissions: - - user_name: user@databricks.com + - user_name: user@company.com level: CAN_MANAGE run_as: - user_name: user@databricks.com + user_name: user@company.com variables: - catalog: main + catalog: catalog schema: default - notifications: [user@databricks.com] \ No newline at end of file + notifications: [user@company.com] \ No newline at end of file diff --git a/contrib/data_engineering/uv.lock b/contrib/data_engineering/uv.lock deleted file mode 100644 index 86c538ef..00000000 --- a/contrib/data_engineering/uv.lock +++ /dev/null @@ -1,412 +0,0 @@ -version = 1 -requires-python = "==3.10.*" - -[[package]] -name = "cachetools" -version = "5.5.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c3/38/a0f315319737ecf45b4319a8cd1f3a908e29d9277b46942263292115eee7/cachetools-5.5.0.tar.gz", hash = "sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a", size = 27661 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a4/07/14f8ad37f2d12a5ce41206c21820d8cb6561b728e51fad4530dff0552a67/cachetools-5.5.0-py3-none-any.whl", hash = "sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292", size = 9524 }, -] - -[[package]] -name = "certifi" -version = "2024.12.14" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/0f/bd/1d41ee578ce09523c81a15426705dd20969f5abf006d1afe8aeff0dd776a/certifi-2024.12.14.tar.gz", hash = "sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db", size = 166010 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a5/32/8f6669fc4798494966bf446c8c4a162e0b5d893dff088afddf76414f70e1/certifi-2024.12.14-py3-none-any.whl", hash = "sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56", size = 164927 }, -] - -[[package]] -name = "charset-normalizer" -version = "3.4.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/16/b0/572805e227f01586461c80e0fd25d65a2115599cc9dad142fee4b747c357/charset_normalizer-3.4.1.tar.gz", hash = "sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3", size = 123188 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0d/58/5580c1716040bc89206c77d8f74418caf82ce519aae06450393ca73475d1/charset_normalizer-3.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de", size = 198013 }, - { url = "https://files.pythonhosted.org/packages/d0/11/00341177ae71c6f5159a08168bcb98c6e6d196d372c94511f9f6c9afe0c6/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176", size = 141285 }, - { url = "https://files.pythonhosted.org/packages/01/09/11d684ea5819e5a8f5100fb0b38cf8d02b514746607934134d31233e02c8/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e218488cd232553829be0664c2292d3af2eeeb94b32bea483cf79ac6a694e037", size = 151449 }, - { url = "https://files.pythonhosted.org/packages/08/06/9f5a12939db324d905dc1f70591ae7d7898d030d7662f0d426e2286f68c9/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:80ed5e856eb7f30115aaf94e4a08114ccc8813e6ed1b5efa74f9f82e8509858f", size = 143892 }, - { url = "https://files.pythonhosted.org/packages/93/62/5e89cdfe04584cb7f4d36003ffa2936681b03ecc0754f8e969c2becb7e24/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b010a7a4fd316c3c484d482922d13044979e78d1861f0e0650423144c616a46a", size = 146123 }, - { url = "https://files.pythonhosted.org/packages/a9/ac/ab729a15c516da2ab70a05f8722ecfccc3f04ed7a18e45c75bbbaa347d61/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4532bff1b8421fd0a320463030c7520f56a79c9024a4e88f01c537316019005a", size = 147943 }, - { url = "https://files.pythonhosted.org/packages/03/d2/3f392f23f042615689456e9a274640c1d2e5dd1d52de36ab8f7955f8f050/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d973f03c0cb71c5ed99037b870f2be986c3c05e63622c017ea9816881d2dd247", size = 142063 }, - { url = "https://files.pythonhosted.org/packages/f2/e3/e20aae5e1039a2cd9b08d9205f52142329f887f8cf70da3650326670bddf/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3a3bd0dcd373514dcec91c411ddb9632c0d7d92aed7093b8c3bbb6d69ca74408", size = 150578 }, - { url = "https://files.pythonhosted.org/packages/8d/af/779ad72a4da0aed925e1139d458adc486e61076d7ecdcc09e610ea8678db/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:d9c3cdf5390dcd29aa8056d13e8e99526cda0305acc038b96b30352aff5ff2bb", size = 153629 }, - { url = "https://files.pythonhosted.org/packages/c2/b6/7aa450b278e7aa92cf7732140bfd8be21f5f29d5bf334ae987c945276639/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:2bdfe3ac2e1bbe5b59a1a63721eb3b95fc9b6817ae4a46debbb4e11f6232428d", size = 150778 }, - { url = 
"https://files.pythonhosted.org/packages/39/f4/d9f4f712d0951dcbfd42920d3db81b00dd23b6ab520419626f4023334056/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:eab677309cdb30d047996b36d34caeda1dc91149e4fdca0b1a039b3f79d9a807", size = 146453 }, - { url = "https://files.pythonhosted.org/packages/49/2b/999d0314e4ee0cff3cb83e6bc9aeddd397eeed693edb4facb901eb8fbb69/charset_normalizer-3.4.1-cp310-cp310-win32.whl", hash = "sha256:c0429126cf75e16c4f0ad00ee0eae4242dc652290f940152ca8c75c3a4b6ee8f", size = 95479 }, - { url = "https://files.pythonhosted.org/packages/2d/ce/3cbed41cff67e455a386fb5e5dd8906cdda2ed92fbc6297921f2e4419309/charset_normalizer-3.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:9f0b8b1c6d84c8034a44893aba5e767bf9c7a211e313a9605d9c617d7083829f", size = 102790 }, - { url = "https://files.pythonhosted.org/packages/0e/f6/65ecc6878a89bb1c23a086ea335ad4bf21a588990c3f535a227b9eea9108/charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85", size = 49767 }, -] - -[[package]] -name = "colorama" -version = "0.4.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, -] - -[[package]] -name = "databricks-connect" -version = "15.1.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "databricks-sdk" }, - { name = "googleapis-common-protos" }, - { name = "grpcio" }, - { name = "grpcio-status" }, - { name = "numpy" }, - { name = "packaging" }, - { 
name = "pandas" }, - { name = "py4j" }, - { name = "pyarrow" }, - { name = "six" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/ce/62/d95d56cb13bbcfd320ceb70786bbd0396a49cfd864aa8668115f3d28cb10/databricks_connect-15.1.3-py2.py3-none-any.whl", hash = "sha256:a576b31a0716ff66d68ff6bfcbd64ecafa6313c9ff76835af3ee7cd05e708a72", size = 2244900 }, -] - -[[package]] -name = "databricks-dlt" -version = "0.3.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "databricks-connect" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/88/ff/8ba06d337f78e43ad15830edb27b8d4b92a533403d147f98280e932f99a3/databricks-dlt-0.3.0.tar.gz", hash = "sha256:c755ac015e87095c5a67710bf82e870e6de76130bb2a962046ae913f21af03cc", size = 11084 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dd/16/260f89a35b6f08ada9bd4a6b6e1dabdc3c4034ebceddae5c9e1f146a0167/databricks_dlt-0.3.0-py3-none-any.whl", hash = "sha256:232f97e0fc93bae7bbc100f17880fd88c2777c92cbf002e5b5e7810572df2c4a", size = 11171 }, -] - -[[package]] -name = "databricks-sdk" -version = "0.40.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-auth" }, - { name = "requests" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f6/f2/913357137260f22e269c91465e44d2364755765a5445a85e94551c22f83e/databricks_sdk-0.40.0.tar.gz", hash = "sha256:48c6926ab840bd49e200122bccd72d9e7c823030949fd96a97d903df4fe2c2e7", size = 648096 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/10/f8/190dce0b219596fe2cc0bb93b95e0498de17ce593aeb052949518287c15f/databricks_sdk-0.40.0-py3-none-any.whl", hash = "sha256:998a3d118b89abdfd7151a9f0f6065a865a3f84d6ba434118175f4e456d5fa73", size = 629735 }, -] - -[[package]] -name = "exceptiongroup" -version = "1.2.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/09/35/2495c4ac46b980e4ca1f6ad6db102322ef3ad2410b79fdde159a4b0f3b92/exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc", size = 28883 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", size = 16453 }, -] - -[[package]] -name = "google-auth" -version = "2.37.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cachetools" }, - { name = "pyasn1-modules" }, - { name = "rsa" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/46/af/b25763b9d35dfc2c6f9c3ec34d8d3f1ba760af3a7b7e8d5c5f0579522c45/google_auth-2.37.0.tar.gz", hash = "sha256:0054623abf1f9c83492c63d3f47e77f0a544caa3d40b2d98e099a611c2dd5d00", size = 268878 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8d/8d/4d5d5f9f500499f7bd4c93903b43e8d6976f3fc6f064637ded1a85d09b07/google_auth-2.37.0-py2.py3-none-any.whl", hash = "sha256:42664f18290a6be591be5329a96fe30184be1a1badb7292a7f686a9659de9ca0", size = 209829 }, -] - -[[package]] -name = "googleapis-common-protos" -version = "1.66.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "protobuf" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ff/a7/8e9cccdb1c49870de6faea2a2764fa23f627dd290633103540209f03524c/googleapis_common_protos-1.66.0.tar.gz", hash = "sha256:c3e7b33d15fdca5374cc0a7346dd92ffa847425cc4ea941d970f13680052ec8c", size = 114376 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a0/0f/c0713fb2b3d28af4b2fded3291df1c4d4f79a00d15c2374a9e010870016c/googleapis_common_protos-1.66.0-py2.py3-none-any.whl", hash = "sha256:d7abcd75fabb2e0ec9f74466401f6c119a0b498e27370e9be4c94cb7e382b8ed", size = 221682 }, -] - -[[package]] -name = "grpcio" -version = 
"1.69.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e4/87/06a145284cbe86c91ca517fe6b57be5efbb733c0d6374b407f0992054d18/grpcio-1.69.0.tar.gz", hash = "sha256:936fa44241b5379c5afc344e1260d467bee495747eaf478de825bab2791da6f5", size = 12738244 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b0/6e/2f8ee5fb65aef962d0bd7e46b815e7b52820687e29c138eaee207a688abc/grpcio-1.69.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:2060ca95a8db295ae828d0fc1c7f38fb26ccd5edf9aa51a0f44251f5da332e97", size = 5190753 }, - { url = "https://files.pythonhosted.org/packages/89/07/028dcda44d40f9488f0a0de79c5ffc80e2c1bc5ed89da9483932e3ea67cf/grpcio-1.69.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:2e52e107261fd8fa8fa457fe44bfadb904ae869d87c1280bf60f93ecd3e79278", size = 11096752 }, - { url = "https://files.pythonhosted.org/packages/99/a0/c727041b1410605ba38b585b6b52c1a289d7fcd70a41bccbc2c58fc643b2/grpcio-1.69.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:316463c0832d5fcdb5e35ff2826d9aa3f26758d29cdfb59a368c1d6c39615a11", size = 5705442 }, - { url = "https://files.pythonhosted.org/packages/7a/2f/1c53f5d127ff882443b19c757d087da1908f41c58c4b098e8eaf6b2bb70a/grpcio-1.69.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:26c9a9c4ac917efab4704b18eed9082ed3b6ad19595f047e8173b5182fec0d5e", size = 6333796 }, - { url = "https://files.pythonhosted.org/packages/cc/f6/2017da2a1b64e896af710253e5bfbb4188605cdc18bce3930dae5cdbf502/grpcio-1.69.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90b3646ced2eae3a0599658eeccc5ba7f303bf51b82514c50715bdd2b109e5ec", size = 5954245 }, - { url = "https://files.pythonhosted.org/packages/c1/65/1395bec928e99ba600464fb01b541e7e4cdd462e6db25259d755ef9f8d02/grpcio-1.69.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3b75aea7c6cb91b341c85e7c1d9db1e09e1dd630b0717f836be94971e015031e", size = 6664854 }, - { url = 
"https://files.pythonhosted.org/packages/40/57/8b3389cfeb92056c8b44288c9c4ed1d331bcad0215c4eea9ae4629e156d9/grpcio-1.69.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5cfd14175f9db33d4b74d63de87c64bb0ee29ce475ce3c00c01ad2a3dc2a9e51", size = 6226854 }, - { url = "https://files.pythonhosted.org/packages/cc/61/1f2bbeb7c15544dffc98b3f65c093e746019995e6f1e21dc3655eec3dc23/grpcio-1.69.0-cp310-cp310-win32.whl", hash = "sha256:9031069d36cb949205293cf0e243abd5e64d6c93e01b078c37921493a41b72dc", size = 3662734 }, - { url = "https://files.pythonhosted.org/packages/ef/ba/bf1a6d9f5c17d2da849793d72039776c56c98c889c9527f6721b6ee57e6e/grpcio-1.69.0-cp310-cp310-win_amd64.whl", hash = "sha256:cc89b6c29f3dccbe12d7a3b3f1b3999db4882ae076c1c1f6df231d55dbd767a5", size = 4410306 }, -] - -[[package]] -name = "grpcio-status" -version = "1.69.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "googleapis-common-protos" }, - { name = "grpcio" }, - { name = "protobuf" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/02/35/52dc0d8300f879dbf9cdc95764cee9f56d5a212998cfa1a8871b262df2a4/grpcio_status-1.69.0.tar.gz", hash = "sha256:595ef84e5178d6281caa732ccf68ff83259241608d26b0e9c40a5e66eee2a2d2", size = 13662 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f6/e2/346a766a4232f74f45f8bc70e636fc3a6677e6bc3893382187829085f12e/grpcio_status-1.69.0-py3-none-any.whl", hash = "sha256:d6b2a3c9562c03a817c628d7ba9a925e209c228762d6d7677ae5c9401a542853", size = 14428 }, -] - -[[package]] -name = "idna" -version = "3.10" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490 } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, -] - -[[package]] -name = "iniconfig" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d7/4b/cbd8e699e64a6f16ca3a8220661b5f83792b3017d0f79807cb8708d33913/iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3", size = 4646 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 }, -] - -[[package]] -name = "my-data-project" -version = "0.1.0" -source = { editable = "." } -dependencies = [ - { name = "databricks-connect" }, - { name = "databricks-dlt" }, - { name = "pytest" }, -] - -[package.metadata] -requires-dist = [ - { name = "databricks-connect", specifier = "==15.1.*" }, - { name = "databricks-dlt" }, - { name = "pytest" }, -] - -[[package]] -name = "numpy" -version = "1.26.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010", size = 15786129 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a7/94/ace0fdea5241a27d13543ee117cbc65868e82213fb31a8eb7fe9ff23f313/numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0", size = 20631468 }, - { url = 
"https://files.pythonhosted.org/packages/20/f7/b24208eba89f9d1b58c1668bc6c8c4fd472b20c45573cb767f59d49fb0f6/numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a", size = 13966411 }, - { url = "https://files.pythonhosted.org/packages/fc/a5/4beee6488160798683eed5bdb7eead455892c3b4e1f78d79d8d3f3b084ac/numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4", size = 14219016 }, - { url = "https://files.pythonhosted.org/packages/4b/d7/ecf66c1cd12dc28b4040b15ab4d17b773b87fa9d29ca16125de01adb36cd/numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f", size = 18240889 }, - { url = "https://files.pythonhosted.org/packages/24/03/6f229fe3187546435c4f6f89f6d26c129d4f5bed40552899fcf1f0bf9e50/numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a", size = 13876746 }, - { url = "https://files.pythonhosted.org/packages/39/fe/39ada9b094f01f5a35486577c848fe274e374bbf8d8f472e1423a0bbd26d/numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2", size = 18078620 }, - { url = "https://files.pythonhosted.org/packages/d5/ef/6ad11d51197aad206a9ad2286dc1aac6a378059e06e8cf22cd08ed4f20dc/numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07", size = 5972659 }, - { url = "https://files.pythonhosted.org/packages/19/77/538f202862b9183f54108557bfda67e17603fc560c384559e769321c9d92/numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5", size = 15808905 }, -] - -[[package]] -name = "packaging" -version = "24.2" -source = { registry = 
"https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d0/63/68dbb6eb2de9cb10ee4c9c14a0148804425e13c4fb20d61cce69f53106da/packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f", size = 163950 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451 }, -] - -[[package]] -name = "pandas" -version = "2.2.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, - { name = "python-dateutil" }, - { name = "pytz" }, - { name = "tzdata" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9c/d6/9f8431bacc2e19dca897724cd097b1bb224a6ad5433784a44b587c7c13af/pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667", size = 4399213 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/aa/70/c853aec59839bceed032d52010ff5f1b8d87dc3114b762e4ba2727661a3b/pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5", size = 12580827 }, - { url = "https://files.pythonhosted.org/packages/99/f2/c4527768739ffa4469b2b4fff05aa3768a478aed89a2f271a79a40eee984/pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348", size = 11303897 }, - { url = "https://files.pythonhosted.org/packages/ed/12/86c1747ea27989d7a4064f806ce2bae2c6d575b950be087837bdfcabacc9/pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed", size = 66480908 }, - { url = 
"https://files.pythonhosted.org/packages/44/50/7db2cd5e6373ae796f0ddad3675268c8d59fb6076e66f0c339d61cea886b/pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57", size = 13064210 }, - { url = "https://files.pythonhosted.org/packages/61/61/a89015a6d5536cb0d6c3ba02cebed51a95538cf83472975275e28ebf7d0c/pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42", size = 16754292 }, - { url = "https://files.pythonhosted.org/packages/ce/0d/4cc7b69ce37fac07645a94e1d4b0880b15999494372c1523508511b09e40/pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f", size = 14416379 }, - { url = "https://files.pythonhosted.org/packages/31/9e/6ebb433de864a6cd45716af52a4d7a8c3c9aaf3a98368e61db9e69e69a9c/pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645", size = 11598471 }, -] - -[[package]] -name = "pluggy" -version = "1.5.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, -] - -[[package]] -name = "protobuf" -version = "5.29.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f7/d1/e0a911544ca9993e0f17ce6d3cc0932752356c1b0a834397f28e63479344/protobuf-5.29.3.tar.gz", hash = 
"sha256:5da0f41edaf117bde316404bad1a486cb4ededf8e4a54891296f648e8e076620", size = 424945 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/7a/1e38f3cafa022f477ca0f57a1f49962f21ad25850c3ca0acd3b9d0091518/protobuf-5.29.3-cp310-abi3-win32.whl", hash = "sha256:3ea51771449e1035f26069c4c7fd51fba990d07bc55ba80701c78f886bf9c888", size = 422708 }, - { url = "https://files.pythonhosted.org/packages/61/fa/aae8e10512b83de633f2646506a6d835b151edf4b30d18d73afd01447253/protobuf-5.29.3-cp310-abi3-win_amd64.whl", hash = "sha256:a4fa6f80816a9a0678429e84973f2f98cbc218cca434abe8db2ad0bffc98503a", size = 434508 }, - { url = "https://files.pythonhosted.org/packages/dd/04/3eaedc2ba17a088961d0e3bd396eac764450f431621b58a04ce898acd126/protobuf-5.29.3-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:a8434404bbf139aa9e1300dbf989667a83d42ddda9153d8ab76e0d5dcaca484e", size = 417825 }, - { url = "https://files.pythonhosted.org/packages/4f/06/7c467744d23c3979ce250397e26d8ad8eeb2bea7b18ca12ad58313c1b8d5/protobuf-5.29.3-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:daaf63f70f25e8689c072cfad4334ca0ac1d1e05a92fc15c54eb9cf23c3efd84", size = 319573 }, - { url = "https://files.pythonhosted.org/packages/a8/45/2ebbde52ad2be18d3675b6bee50e68cd73c9e0654de77d595540b5129df8/protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:c027e08a08be10b67c06bf2370b99c811c466398c357e615ca88c91c07f0910f", size = 319672 }, - { url = "https://files.pythonhosted.org/packages/fd/b2/ab07b09e0f6d143dfb839693aa05765257bceaa13d03bf1a696b78323e7a/protobuf-5.29.3-py3-none-any.whl", hash = "sha256:0a18ed4a24198528f2333802eb075e59dea9d679ab7a6c5efb017a59004d849f", size = 172550 }, -] - -[[package]] -name = "py4j" -version = "0.10.9.7" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1e/f2/b34255180c72c36ff7097f7c2cdca02abcbd89f5eebf7c7c41262a9a0637/py4j-0.10.9.7.tar.gz", hash = 
"sha256:0b6e5315bb3ada5cf62ac651d107bb2ebc02def3dee9d9548e3baac644ea8dbb", size = 1508234 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/10/30/a58b32568f1623aaad7db22aa9eafc4c6c194b429ff35bdc55ca2726da47/py4j-0.10.9.7-py2.py3-none-any.whl", hash = "sha256:85defdfd2b2376eb3abf5ca6474b51ab7e0de341c75a02f46dc9b5976f5a5c1b", size = 200481 }, -] - -[[package]] -name = "pyarrow" -version = "18.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/7f/7b/640785a9062bb00314caa8a387abce547d2a420cf09bd6c715fe659ccffb/pyarrow-18.1.0.tar.gz", hash = "sha256:9386d3ca9c145b5539a1cfc75df07757dff870168c959b473a0bccbc3abc8c73", size = 1118671 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1a/bb/8d4a1573f66e0684f190dd2b55fd0b97a7214de8882d58a3867e777bf640/pyarrow-18.1.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e21488d5cfd3d8b500b3238a6c4b075efabc18f0f6d80b29239737ebd69caa6c", size = 29531620 }, - { url = "https://files.pythonhosted.org/packages/30/90/893acfad917533b624a97b9e498c0e8393908508a0a72d624fe935e632bf/pyarrow-18.1.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:b516dad76f258a702f7ca0250885fc93d1fa5ac13ad51258e39d402bd9e2e1e4", size = 30836521 }, - { url = "https://files.pythonhosted.org/packages/a3/2a/526545a7464b5fb2fa6e2c4bad16ca90e59e1843025c534fd907b7f73e5a/pyarrow-18.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f443122c8e31f4c9199cb23dca29ab9427cef990f283f80fe15b8e124bcc49b", size = 39213905 }, - { url = "https://files.pythonhosted.org/packages/8a/77/4b3fab91a30e19e233e738d0c5eca5a8f6dd05758bc349a2ca262c65de79/pyarrow-18.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0a03da7f2758645d17b7b4f83c8bffeae5bbb7f974523fe901f36288d2eab71", size = 40128881 }, - { url = 
"https://files.pythonhosted.org/packages/aa/e2/a88e16c5e45e562449c52305bd3bc2f9d704295322d3434656e7ccac1444/pyarrow-18.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:ba17845efe3aa358ec266cf9cc2800fa73038211fb27968bfa88acd09261a470", size = 38627517 }, - { url = "https://files.pythonhosted.org/packages/6d/84/8037c20005ccc7b869726465be0957bd9c29cfc88612962030f08292ad06/pyarrow-18.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:3c35813c11a059056a22a3bef520461310f2f7eea5c8a11ef9de7062a23f8d56", size = 40060187 }, - { url = "https://files.pythonhosted.org/packages/2a/38/d6435c723ff73df8ae74626ea778262fbcc2b9b0d1a4f3db915b61711b05/pyarrow-18.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9736ba3c85129d72aefa21b4f3bd715bc4190fe4426715abfff90481e7d00812", size = 25118314 }, -] - -[[package]] -name = "pyasn1" -version = "0.6.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ba/e9/01f1a64245b89f039897cb0130016d79f77d52669aae6ee7b159a6c4c018/pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034", size = 145322 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629", size = 83135 }, -] - -[[package]] -name = "pyasn1-modules" -version = "0.4.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyasn1" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1d/67/6afbf0d507f73c32d21084a79946bfcfca5fbc62a72057e9c23797a737c9/pyasn1_modules-0.4.1.tar.gz", hash = "sha256:c28e2dbf9c06ad61c71a075c7e0f9fd0f1b0bb2d2ad4377f240d33ac2ab60a7c", size = 310028 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/77/89/bc88a6711935ba795a679ea6ebee07e128050d6382eaa35a0a47c8032bdc/pyasn1_modules-0.4.1-py3-none-any.whl", hash = 
"sha256:49bfa96b45a292b711e986f222502c1c9a5e1f4e568fc30e2574a6c7d07838fd", size = 181537 }, -] - -[[package]] -name = "pytest" -version = "8.3.4" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "exceptiongroup" }, - { name = "iniconfig" }, - { name = "packaging" }, - { name = "pluggy" }, - { name = "tomli" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/05/35/30e0d83068951d90a01852cb1cef56e5d8a09d20c7f511634cc2f7e0372a/pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761", size = 1445919 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/11/92/76a1c94d3afee238333bc0a42b82935dd8f9cf8ce9e336ff87ee14d9e1cf/pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6", size = 343083 }, -] - -[[package]] -name = "python-dateutil" -version = "2.9.0.post0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "six" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892 }, -] - -[[package]] -name = "pytz" -version = "2024.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/3a/31/3c70bf7603cc2dca0f19bdc53b4537a797747a58875b552c8c413d963a3f/pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a", size = 319692 } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/11/c3/005fcca25ce078d2cc29fd559379817424e94885510568bc1bc53d7d5846/pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725", size = 508002 }, -] - -[[package]] -name = "requests" -version = "2.32.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "charset-normalizer" }, - { name = "idna" }, - { name = "urllib3" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", size = 131218 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 }, -] - -[[package]] -name = "rsa" -version = "4.9" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyasn1" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/aa/65/7d973b89c4d2351d7fb232c2e452547ddfa243e93131e7cfa766da627b52/rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21", size = 29711 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/49/97/fa78e3d2f65c02c8e1268b9aba606569fe97f6c8f7c2d74394553347c145/rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7", size = 34315 }, -] - -[[package]] -name = "six" -version = "1.17.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031 } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050 }, -] - -[[package]] -name = "tomli" -version = "2.2.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/18/87/302344fed471e44a87289cf4967697d07e532f2421fdaf868a303cbae4ff/tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff", size = 17175 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257 }, -] - -[[package]] -name = "tzdata" -version = "2024.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e1/34/943888654477a574a86a98e9896bae89c7aa15078ec29f490fef2f1e5384/tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc", size = 193282 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a6/ab/7e5f53c3b9d14972843a647d8d7a853969a58aecc7559cb3267302c94774/tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd", size = 346586 }, -] - -[[package]] -name = "urllib3" -version = "2.3.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/aa/63/e53da845320b757bf29ef6a9062f5c669fe997973f966045cb019c3f4b66/urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d", size = 307268 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c8/19/4ec628951a74043532ca2cf5d97b7b14863931476d117c471e8e2b1eb39f/urllib3-2.3.0-py3-none-any.whl", hash = 
"sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df", size = 128369 }, -] diff --git a/dbt_sql/.gitignore b/dbt_sql/.gitignore index cced6581..23116291 100644 --- a/dbt_sql/.gitignore +++ b/dbt_sql/.gitignore @@ -11,4 +11,5 @@ scratch/** # dbt target/ dbt_packages/ +dbt_modules/ logs/ diff --git a/dbt_sql/.vscode/settings.json b/dbt_sql/.vscode/settings.json index 82860360..a9355ea8 100644 --- a/dbt_sql/.vscode/settings.json +++ b/dbt_sql/.vscode/settings.json @@ -1,6 +1,5 @@ { "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ diff --git a/dbt_sql/README.md b/dbt_sql/README.md index 7c2e6069..f012a898 100644 --- a/dbt_sql/README.md +++ b/dbt_sql/README.md @@ -121,7 +121,7 @@ You can find that job by opening your workpace and clicking on **Workflows**. You can also deploy to your production target directly from the command-line. The warehouse, catalog, and schema for that target are configured in databricks.yml. -When deploying to this target, note that the default job at resources/dbt_sql_job.yml +When deploying to this target, note that the default job at resources/dbt_sql.job.yml has a schedule set that runs every day. The schedule is paused when deploying in development mode (see https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). diff --git a/dbt_sql/databricks.yml b/dbt_sql/databricks.yml index 5741351a..62de07b7 100644 --- a/dbt_sql/databricks.yml +++ b/dbt_sql/databricks.yml @@ -3,20 +3,22 @@ # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. bundle: name: dbt_sql + uuid: 5e5ca8d5-0388-473e-84a1-1414ed89c5df include: - resources/*.yml + - resources/*/*.yml # Deployment targets. 
# The default schema, catalog, etc. for dbt are defined in dbt_profiles/profiles.yml targets: dev: - default: true # The default target uses 'mode: development' to create a development copy. # - Deployed resources get prefixed with '[dev my_user_name]' # - Any job schedules and triggers are paused by default. # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. mode: development + default: true workspace: host: https://company.databricks.com @@ -24,10 +26,8 @@ targets: mode: production workspace: host: https://company.databricks.com - # We explicitly specify /Users/user@company.com to make sure we only have a single copy. - root_path: /Users/user@company.com/.bundle/${bundle.name}/${bundle.target} + # We explicitly deploy to /Workspace/Users/user@company.com to make sure we only have a single copy. + root_path: /Workspace/Users/user@company.com/.bundle/${bundle.name}/${bundle.target} permissions: - user_name: user@company.com level: CAN_MANAGE - run_as: - user_name: user@company.com diff --git a/dbt_sql/dbt_profiles/profiles.yml b/dbt_sql/dbt_profiles/profiles.yml index 34e98a59..0d757ae6 100644 --- a/dbt_sql/dbt_profiles/profiles.yml +++ b/dbt_sql/dbt_profiles/profiles.yml @@ -1,38 +1,38 @@ # This file defines dbt profiles for deployed dbt jobs. dbt_sql: - target: dev # default target - outputs: - - # Doing local development with the dbt CLI? - # Then you should create your own profile in your .dbt/profiles.yml using 'dbt init' - # (See README.md) - - # The default target when deployed with the Databricks CLI - # N.B. 
when you use dbt from the command line, it uses the profile from .dbt/profiles.yml - dev: - type: databricks - method: http - catalog: main - schema: "{{ var('dev_schema') }}" - - http_path: /sql/1.0/warehouses/abcdef1234567890 - - # The workspace host / token are provided by Databricks - # see databricks.yml for the workspace host used for 'dev' - host: "{{ env_var('DBT_HOST') }}" - token: "{{ env_var('DBT_ACCESS_TOKEN') }}" - - # The production target when deployed with the Databricks CLI - prod: - type: databricks - method: http - catalog: main - schema: default - - http_path: /sql/1.0/warehouses/abcdef1234567890 - - # The workspace host / token are provided by Databricks - # see databricks.yml for the workspace host used for 'prod' - host: "{{ env_var('DBT_HOST') }}" - token: "{{ env_var('DBT_ACCESS_TOKEN') }}" + target: dev # default target + outputs: + + # Doing local development with the dbt CLI? + # Then you should create your own profile in your .dbt/profiles.yml using 'dbt init' + # (See README.md) + + # The default target when deployed with the Databricks CLI + # N.B. 
when you use dbt from the command line, it uses the profile from .dbt/profiles.yml + dev: + type: databricks + method: http + catalog: catalog + schema: "{{ var('dev_schema') }}" + + http_path: /sql/1.0/warehouses/abcdef1234567890 + + # The workspace host / token are provided by Databricks + # see databricks.yml for the workspace host used for 'dev' + host: "{{ env_var('DBT_HOST') }}" + token: "{{ env_var('DBT_ACCESS_TOKEN') }}" + + # The production target when deployed with the Databricks CLI + prod: + type: databricks + method: http + catalog: catalog + schema: default + + http_path: /sql/1.0/warehouses/abcdef1234567890 + + # The workspace host / token are provided by Databricks + # see databricks.yml for the workspace host used for 'prod' + host: "{{ env_var('DBT_HOST') }}" + token: "{{ env_var('DBT_ACCESS_TOKEN') }}" diff --git a/dbt_sql/dbt_project.yml b/dbt_sql/dbt_project.yml index 0c979c75..947412e3 100644 --- a/dbt_sql/dbt_project.yml +++ b/dbt_sql/dbt_project.yml @@ -15,7 +15,7 @@ seed-paths: ["src/seeds"] macro-paths: ["src/macros"] snapshot-paths: ["src/snapshots"] -clean-targets: # directories to be removed by `dbt clean` +clean-targets: # directories to be removed by `dbt clean` - "target" - "dbt_packages" diff --git a/dbt_sql/profile_template.yml b/dbt_sql/profile_template.yml index 1ddbeaf3..e6f2e69d 100644 --- a/dbt_sql/profile_template.yml +++ b/dbt_sql/profile_template.yml @@ -5,7 +5,7 @@ fixed: type: databricks prompts: host: - default: myworkspace.databricks.com + default: company.databricks.com token: hint: 'personal access token to use, dapiXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX' hide_input: true @@ -14,7 +14,7 @@ prompts: default: /sql/1.0/warehouses/abcdef1234567890 catalog: hint: 'initial catalog' - default: main + default: catalog schema: hint: 'personal schema where dbt will build objects during development, example: user_name' threads: diff --git a/dbt_sql/resources/dbt_sql_job.yml b/dbt_sql/resources/dbt_sql.job.yml similarity index 55% 
rename from dbt_sql/resources/dbt_sql_job.yml rename to dbt_sql/resources/dbt_sql.job.yml index a25dc1ab..db1d1d43 100644 --- a/dbt_sql/resources/dbt_sql_job.yml +++ b/dbt_sql/resources/dbt_sql.job.yml @@ -9,26 +9,25 @@ resources: interval: 1 unit: DAYS - email_notifications: - on_failure: - - user@company.com + #email_notifications: + # on_failure: + # - your_email@example.com tasks: - task_key: dbt - dbt_task: project_directory: ../ # The default schema, catalog, etc. are defined in ../dbt_profiles/profiles.yml profiles_directory: dbt_profiles/ commands: - # The dbt commands to run (see also dbt_profiles/profiles.yml; dev_schema is used in the dev profile) - - 'dbt deps --target=${bundle.target}' - - 'dbt seed --target=${bundle.target} --vars "{ dev_schema: ${workspace.current_user.short_name} }"' - - 'dbt run --target=${bundle.target} --vars "{ dev_schema: ${workspace.current_user.short_name} }"' + # The dbt commands to run (see also dbt_profiles/profiles.yml; dev_schema is used in the dev profile) + - 'dbt deps --target=${bundle.target}' + - 'dbt seed --target=${bundle.target} --vars "{ dev_schema: ${workspace.current_user.short_name} }"' + - 'dbt run --target=${bundle.target} --vars "{ dev_schema: ${workspace.current_user.short_name} }"' libraries: - - pypi: - package: dbt-databricks>=1.8.0,<2.0.0 + - pypi: + package: dbt-databricks>=1.8.0,<2.0.0 new_cluster: spark_version: 15.4.x-scala2.12 @@ -36,7 +35,7 @@ resources: data_security_mode: SINGLE_USER num_workers: 0 spark_conf: - spark.master: "local[*, 4]" - spark.databricks.cluster.profile: singleNode + spark.master: "local[*, 4]" + spark.databricks.cluster.profile: singleNode custom_tags: ResourceClass: SingleNode diff --git a/default_python/.vscode/settings.json b/default_python/.vscode/settings.json index f19498da..8ee87c30 100644 --- a/default_python/.vscode/settings.json +++ b/default_python/.vscode/settings.json @@ -1,6 +1,5 @@ { "python.analysis.stubPath": ".vscode", - "databricks.python.envFile":
"${workspaceFolder}/.env", "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ diff --git a/default_python/README.md b/default_python/README.md index 3f11022c..c4f3a2b6 100644 --- a/default_python/README.md +++ b/default_python/README.md @@ -2,16 +2,39 @@ The 'default_python' project was generated by using the default-python template. +For documentation on the Databricks Asset Bundles format used for this project, +and for CI/CD configuration, see https://docs.databricks.com/aws/en/dev-tools/bundles. + ## Getting started -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. + +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. + +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + + +Dependencies for this project should be installed using UV: -2. Authenticate to your Databricks workspace, if you have not done so already: +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. + +# Using this project using the CLI + +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: ``` $ databricks configure ``` -3. To deploy a development copy of this project, type: +2.
To deploy a development copy of this project, type: ``` $ databricks bundle deploy --target dev ``` @@ -21,27 +44,24 @@ The 'default_python' project was generated by using the default-python template. This deploys everything that's defined for this project. For example, the default template would deploy a job called `[dev yourname] default_python_job` to your workspace. - You can find that job by opening your workpace and clicking on **Workflows**. + You can find that job by opening your workspace and clicking on **Jobs & Pipelines**. -4. Similarly, to deploy a production copy, type: +3. Similarly, to deploy a production copy, type: ``` $ databricks bundle deploy --target prod ``` Note that the default job from the template has a schedule that runs every day - (defined in resources/default_python_job.yml). The schedule + (defined in resources/default_python.job.yml). The schedule is paused when deploying in development mode (see https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). -5. To run a job or pipeline, use the "run" command: +4. To run a job or pipeline, use the "run" command: ``` $ databricks bundle run ``` -6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from - https://docs.databricks.com/dev-tools/vscode-ext.html. Or read the "getting started" documentation for - **Databricks Connect** for instructions on running the included Python code from a different IDE. - -7. For documentation on the Databricks asset bundles format used - for this project, and for CI/CD configuration, see - https://docs.databricks.com/dev-tools/bundles/index.html. +5. Finally, to run tests locally, use `pytest`: + ``` + $ uv run pytest + ``` diff --git a/default_python/conftest.py b/default_python/conftest.py new file mode 100644 index 00000000..cf1d0978 --- /dev/null +++ b/default_python/conftest.py @@ -0,0 +1,76 @@ +"""This file configures pytest.
+ +This file is in the root since it can be used for tests in any place in this +project, including tests under resources/. +""" + +import os, sys, pathlib +from contextlib import contextmanager + + +try: + from databricks.connect import DatabricksSession + from databricks.sdk import WorkspaceClient + from pyspark.sql import SparkSession + import pytest +except ImportError: + raise ImportError("Test dependencies not found.\n\nRun tests using 'uv run pytest'. See http://docs.astral.sh/uv to learn more about uv.") + + +def add_all_resources_to_sys_path(): + """Add all resources/* directories to sys.path for module discovery.""" + resources = pathlib.Path(__file__).with_name("resources") + resource_dirs = filter(pathlib.Path.is_dir, resources.iterdir()) + seen: dict[str, pathlib.Path] = {} + for resource in resource_dirs: + sys.path.append(str(resource.resolve())) + for py in resource.rglob("*.py"): + mod = ".".join(py.relative_to(resource).with_suffix("").parts) + if mod in seen: + raise ImportError(f"Duplicate module '{mod}' found:\n {seen[mod]}\n {py}") + seen[mod] = py + + +def enable_fallback_compute(): + """Enable serverless compute if no compute is specified.""" + conf = WorkspaceClient().config + if conf.serverless_compute_id or conf.cluster_id or os.environ.get("SPARK_REMOTE"): + return + + url = "https://docs.databricks.com/dev-tools/databricks-connect/cluster-config" + print("☁️ no compute specified, falling back to serverless compute", file=sys.stderr) + print(f" see {url} for manual configuration", file=sys.stdout) + + os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto" + + +@contextmanager +def allow_stderr_output(config: pytest.Config): + """Temporarily disable pytest output capture.""" + capman = config.pluginmanager.get_plugin("capturemanager") + if capman: + with capman.global_and_fixture_disabled(): + yield + else: + yield + + +def pytest_configure(config: pytest.Config): + """Configure pytest session.""" + with allow_stderr_output(config): + 
add_all_resources_to_sys_path() + enable_fallback_compute() + + # Initialize Spark session eagerly, so it is available even when + # SparkSession.builder.getOrCreate() is used. For DB Connect 15+, + # we validate version compatibility with the remote cluster. + if hasattr(DatabricksSession.builder, "validateSession"): + DatabricksSession.builder.validateSession().getOrCreate() + else: + DatabricksSession.builder.getOrCreate() + + +@pytest.fixture(scope="session") +def spark() -> SparkSession: + """Provide a SparkSession fixture for tests.""" + return DatabricksSession.builder.getOrCreate() diff --git a/default_python/databricks.yml b/default_python/databricks.yml index 0e9d0ced..079edb90 100644 --- a/default_python/databricks.yml +++ b/default_python/databricks.yml @@ -2,9 +2,16 @@ # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. bundle: name: default_python + uuid: 87d5a23e-7bc7-4f52-98ee-e374b67d5681 + +artifacts: + python_artifact: + type: whl + build: uv build --wheel include: - resources/*.yml + - resources/*/*.yml targets: dev: @@ -21,10 +28,8 @@ targets: mode: production workspace: host: https://company.databricks.com - # We explicitly specify /Users/user@company.com to make sure we only have a single copy. - root_path: /Users/user@company.com/.bundle/${bundle.name}/${bundle.target} + # We explicitly deploy to /Workspace/Users/user@company.com to make sure we only have a single copy. 
+ root_path: /Workspace/Users/user@company.com/.bundle/${bundle.name}/${bundle.target} permissions: - user_name: user@company.com level: CAN_MANAGE - run_as: - user_name: user@company.com diff --git a/default_python/pyproject.toml b/default_python/pyproject.toml new file mode 100644 index 00000000..dda79245 --- /dev/null +++ b/default_python/pyproject.toml @@ -0,0 +1,35 @@ +[project] +name = "default_python" +version = "0.0.1" +authors = [{ name = "user@company.com" }] +requires-python = ">= 3.11" + +[dependency-groups] +dev = [ + "pytest", + + # Code completion support for DLT, also install databricks-connect + "databricks-dlt", + + # databricks-connect can be used to run parts of this project locally. + # Note that for local development, you should use a version that is not newer + # than the remote cluster or serverless compute you connect to. + # See also https://docs.databricks.com/dev-tools/databricks-connect.html. + "databricks-connect>=15.4,<15.5", +] + +[tool.pytest.ini_options] +pythonpath = "src" +testpaths = [ + "tests", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/default_python"] + +[project.scripts] +main = "default_python.main:main" diff --git a/default_python/pytest.ini b/default_python/pytest.ini deleted file mode 100644 index 80432c22..00000000 --- a/default_python/pytest.ini +++ /dev/null @@ -1,3 +0,0 @@ -[pytest] -testpaths = tests -pythonpath = src diff --git a/default_python/requirements-dev.txt b/default_python/requirements-dev.txt deleted file mode 100644 index 0ffbf6ae..00000000 --- a/default_python/requirements-dev.txt +++ /dev/null @@ -1,29 +0,0 @@ -## requirements-dev.txt: dependencies for local development. 
-## -## For defining dependencies used by jobs in Databricks Workflows, see -## https://docs.databricks.com/dev-tools/bundles/library-dependencies.html - -## Add code completion support for DLT -databricks-dlt - -## pytest is the default package used for testing -pytest - -## Dependencies for building wheel files -setuptools -wheel - -## databricks-connect can be used to run parts of this project locally. -## See https://docs.databricks.com/dev-tools/databricks-connect.html. -## -## databricks-connect is automatically installed if you're using Databricks -## extension for Visual Studio Code -## (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html). -## -## To manually install databricks-connect, either follow the instructions -## at https://docs.databricks.com/dev-tools/databricks-connect.html -## to install the package system-wide. Or uncomment the line below to install a -## version of db-connect that corresponds to the Databricks Runtime version used -## for this project. -# -# databricks-connect>=15.4,<15.5 diff --git a/default_python/resources/default_python.job.yml b/default_python/resources/default_python.job.yml new file mode 100644 index 00000000..0504090a --- /dev/null +++ b/default_python/resources/default_python.job.yml @@ -0,0 +1,45 @@ +# The main job for default_python. 
+resources: + jobs: + default_python_job: + name: default_python_job + + trigger: + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + periodic: + interval: 1 + unit: DAYS + + #email_notifications: + # on_failure: + # - your_email@example.com + + tasks: + - task_key: notebook_task + notebook_task: + notebook_path: ../src/notebook.ipynb + + - task_key: refresh_pipeline + depends_on: + - task_key: notebook_task + pipeline_task: + pipeline_id: ${resources.pipelines.default_python_pipeline.id} + + - task_key: main_task + depends_on: + - task_key: refresh_pipeline + environment_key: default + python_wheel_task: + package_name: default_python + entry_point: main + + # A list of task execution environment specifications that can be referenced by tasks of this job. + environments: + - environment_key: default + + # Full documentation of this spec can be found at: + # https://docs.databricks.com/api/workspace/jobs/create#environments-spec + spec: + client: "2" + dependencies: + - ../dist/*.whl diff --git a/default_python/resources/default_python_pipeline.yml b/default_python/resources/default_python.pipeline.yml similarity index 67% rename from default_python/resources/default_python_pipeline.yml rename to default_python/resources/default_python.pipeline.yml index be61e864..ea7cdc02 100644 --- a/default_python/resources/default_python_pipeline.yml +++ b/default_python/resources/default_python.pipeline.yml @@ -4,10 +4,11 @@ resources: default_python_pipeline: name: default_python_pipeline catalog: main - target: default_python_${bundle.environment} + schema: default_python_${bundle.target} + serverless: true libraries: - notebook: path: ../src/dlt_pipeline.ipynb configuration: - bundle.sourcePath: /Workspace/${workspace.file_path}/src + bundle.sourcePath: ${workspace.file_path}/src diff --git a/default_python/resources/default_python_job.yml b/default_python/resources/default_python_job.yml deleted 
file mode 100644 index cbe9382a..00000000 --- a/default_python/resources/default_python_job.yml +++ /dev/null @@ -1,53 +0,0 @@ -# The main job for default_python. -resources: - jobs: - default_python_job: - name: default_python_job - - trigger: - # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger - periodic: - interval: 1 - unit: DAYS - - email_notifications: - on_failure: - - user@company.com - - tasks: - - task_key: notebook_task - job_cluster_key: job_cluster - notebook_task: - notebook_path: ../src/notebook.ipynb - - - task_key: refresh_pipeline - depends_on: - - task_key: notebook_task - pipeline_task: - pipeline_id: ${resources.pipelines.default_python_pipeline.id} - - - task_key: main_task - depends_on: - - task_key: refresh_pipeline - job_cluster_key: job_cluster - python_wheel_task: - package_name: default_python - entry_point: main - libraries: - # By default we just include the .whl file generated for the default_python package. - # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html - # for more information on how to add other libraries. - - whl: ../dist/*.whl - - job_clusters: - - job_cluster_key: job_cluster - new_cluster: - spark_version: 15.4.x-scala2.12 - # node_type_id is the cluster node type to use. - # Typical node types on AWS include i3.xlarge; - # Standard_D3_v2 on Azure; - # n1-standard-4 on Google Cloud. 
- node_type_id: i3.xlarge - autoscale: - min_workers: 1 - max_workers: 4 diff --git a/default_python/scratch/exploration.ipynb b/default_python/scratch/exploration.ipynb index 8be3b84e..57a9c978 100644 --- a/default_python/scratch/exploration.ipynb +++ b/default_python/scratch/exploration.ipynb @@ -28,10 +28,11 @@ "outputs": [], "source": [ "import sys\n", - "sys.path.append('../src')\n", + "\n", + "sys.path.append(\"../src\")\n", "from default_python import main\n", "\n", - "main.get_taxis(spark).show(10)" + "main.get_taxis().show(10)" ] } ], diff --git a/default_python/setup.py b/default_python/setup.py deleted file mode 100644 index 6ba5b7b4..00000000 --- a/default_python/setup.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -setup.py configuration script describing how to build and package this project. - -This file is primarily used by the setuptools library and typically should not -be executed directly. See README.md for how to deploy, test, and run -the default_python project. -""" -from setuptools import setup, find_packages - -import sys -sys.path.append('./src') - -import datetime -import default_python - -setup( - name="default_python", - # We use timestamp as Local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers.) - # to ensure that changes to wheel package are picked up when used on all-purpose clusters - version=default_python.__version__ + "+" + datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S"), - url="https://databricks.com", - author="user@company.com", - description="wheel file based on default_python/src", - packages=find_packages(where='./src'), - package_dir={'': 'src'}, - entry_points={ - "packages": [ - "main=default_python.main:main" - ] - }, - install_requires=[ - # Dependencies in case the output wheel file is used as a library dependency. 
- # For defining dependencies, when this package is used in Databricks, see: - # https://docs.databricks.com/dev-tools/bundles/library-dependencies.html - "setuptools" - ], -) diff --git a/default_python/src/default_python/__init__.py b/default_python/src/default_python/__init__.py index f102a9ca..e69de29b 100644 --- a/default_python/src/default_python/__init__.py +++ b/default_python/src/default_python/__init__.py @@ -1 +0,0 @@ -__version__ = "0.0.1" diff --git a/default_python/src/default_python/main.py b/default_python/src/default_python/main.py index c514c6dc..04e8be4d 100644 --- a/default_python/src/default_python/main.py +++ b/default_python/src/default_python/main.py @@ -1,21 +1,14 @@ -from pyspark.sql import SparkSession, DataFrame +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame -def get_taxis(spark: SparkSession) -> DataFrame: - return spark.read.table("samples.nyctaxi.trips") +def find_all_taxis() -> DataFrame: + return spark.read.table("samples.nyctaxi.trips") -# Create a new Databricks Connect session. If this fails, -# check that you have configured Databricks Connect correctly. -# See https://docs.databricks.com/dev-tools/databricks-connect.html. -def get_spark() -> SparkSession: - try: - from databricks.connect import DatabricksSession - return DatabricksSession.builder.getOrCreate() - except ImportError: - return SparkSession.builder.getOrCreate() def main(): - get_taxis(get_spark()).show(5) + find_all_taxis().show(5) -if __name__ == '__main__': - main() + +if __name__ == "__main__": + main() diff --git a/default_python/src/dlt_pipeline.ipynb b/default_python/src/dlt_pipeline.ipynb index 4216a065..34e1895e 100644 --- a/default_python/src/dlt_pipeline.ipynb +++ b/default_python/src/dlt_pipeline.ipynb @@ -14,7 +14,7 @@ "source": [ "# DLT pipeline\n", "\n", - "This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/default_python_pipeline.yml." 
+ "This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/default_python.pipeline.yml." ] }, { @@ -34,6 +34,7 @@ "# Import DLT and src/default_python\n", "import dlt\n", "import sys\n", + "\n", "sys.path.append(spark.conf.get(\"bundle.sourcePath\", \".\"))\n", "from pyspark.sql.functions import expr\n", "from default_python import main" @@ -55,11 +56,12 @@ "source": [ "@dlt.view\n", "def taxi_raw():\n", - " return main.get_taxis(spark)\n", + " return main.find_all_taxis()\n", + "\n", "\n", "@dlt.table\n", "def filtered_taxis():\n", - " return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))" + " return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))" ] } ], diff --git a/default_python/src/notebook.ipynb b/default_python/src/notebook.ipynb index daa4e08c..fd49e5b9 100644 --- a/default_python/src/notebook.ipynb +++ b/default_python/src/notebook.ipynb @@ -14,7 +14,7 @@ "source": [ "# Default notebook\n", "\n", - "This default notebook is executed using Databricks Workflows as defined in resources/default_python_job.yml." + "This default notebook is executed using Databricks Workflows as defined in resources/default_python.job.yml." 
] }, { @@ -46,7 +46,7 @@ "source": [ "from default_python import main\n", "\n", - "main.get_taxis(spark).show(10)" + "main.find_all_taxis().show(10)" ] } ], diff --git a/default_python/tests/main_test.py b/default_python/tests/main_test.py index 66b6f0a2..66c27024 100644 --- a/default_python/tests/main_test.py +++ b/default_python/tests/main_test.py @@ -1,6 +1,6 @@ -from default_python.main import get_taxis, get_spark +from default_python import main -def test_main(): - taxis = get_taxis(get_spark()) +def test_find_all_taxis(): + taxis = main.find_all_taxis() assert taxis.count() > 5 diff --git a/default_sql/.vscode/settings.json b/default_sql/.vscode/settings.json index 0c9d4119..1b55565d 100644 --- a/default_sql/.vscode/settings.json +++ b/default_sql/.vscode/settings.json @@ -1,6 +1,5 @@ { "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ @@ -17,7 +16,7 @@ "sqltools.connections": [ { "connectionMethod": "VS Code Extension (beta)", - "catalog": "main", + "catalog": "catalog", "previewLimit": 50, "driver": "Databricks", "name": "databricks", diff --git a/default_sql/databricks.yml b/default_sql/databricks.yml index 19d68680..715c2ebe 100644 --- a/default_sql/databricks.yml +++ b/default_sql/databricks.yml @@ -2,9 +2,11 @@ # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. bundle: name: default_sql + uuid: 853cd9bc-631c-4d4f-bca0-3195c7540854 include: - resources/*.yml + - resources/*/*.yml # Variable declarations. These variables are assigned in the dev/prod targets below. 
variables: @@ -27,21 +29,19 @@ targets: host: https://company.databricks.com variables: warehouse_id: abcdef1234567890 - catalog: main + catalog: catalog schema: ${workspace.current_user.short_name} prod: mode: production workspace: host: https://company.databricks.com - # We explicitly specify /Users/user@company.com to make sure we only have a single copy. - root_path: /Users/user@company.com/.bundle/${bundle.name}/${bundle.target} + # We explicitly deploy to /Workspace/Users/user@company.com to make sure we only have a single copy. + root_path: /Workspace/Users/user@company.com/.bundle/${bundle.name}/${bundle.target} variables: warehouse_id: abcdef1234567890 - catalog: main + catalog: catalog schema: default permissions: - user_name: user@company.com level: CAN_MANAGE - run_as: - user_name: user@company.com diff --git a/default_sql/resources/default_sql_sql_job.yml b/default_sql/resources/default_sql_sql.job.yml similarity index 100% rename from default_sql/resources/default_sql_sql_job.yml rename to default_sql/resources/default_sql_sql.job.yml diff --git a/default_sql/src/orders_daily.sql b/default_sql/src/orders_daily.sql index 18d6d296..101bdd67 100644 --- a/default_sql/src/orders_daily.sql +++ b/default_sql/src/orders_daily.sql @@ -1,4 +1,4 @@ --- This query is executed using Databricks Workflows (see resources/default_sql_sql_job.yml) +-- This query is executed using Databricks Workflows (see resources/default_sql_sql.job.yml) USE CATALOG {{catalog}}; USE IDENTIFIER({{schema}}); diff --git a/default_sql/src/orders_raw.sql b/default_sql/src/orders_raw.sql index 88fe4298..dfb1ce69 100644 --- a/default_sql/src/orders_raw.sql +++ b/default_sql/src/orders_raw.sql @@ -1,4 +1,4 @@ --- This query is executed using Databricks Workflows (see resources/default_sql_sql_job.yml) +-- This query is executed using Databricks Workflows (see resources/default_sql_sql.job.yml) -- -- The streaming table below ingests all JSON files in 
/databricks-datasets/retail-org/sales_orders/ -- See also https://docs.databricks.com/sql/language-manual/sql-ref-syntax-ddl-create-streaming-table.html diff --git a/lakeflow_pipelines_python/.gitignore b/lakeflow_pipelines_python/.gitignore new file mode 100644 index 00000000..f6a3b5ff --- /dev/null +++ b/lakeflow_pipelines_python/.gitignore @@ -0,0 +1,8 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +**/explorations/** +!**/explorations/README.md diff --git a/lakeflow_pipelines_python/.vscode/__builtins__.pyi b/lakeflow_pipelines_python/.vscode/__builtins__.pyi new file mode 100644 index 00000000..0edd5181 --- /dev/null +++ b/lakeflow_pipelines_python/.vscode/__builtins__.pyi @@ -0,0 +1,3 @@ +# Typings for Pylance in Visual Studio Code +# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md +from databricks.sdk.runtime import * diff --git a/lakeflow_pipelines_python/.vscode/extensions.json b/lakeflow_pipelines_python/.vscode/extensions.json new file mode 100644 index 00000000..5d15eba3 --- /dev/null +++ b/lakeflow_pipelines_python/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "databricks.databricks", + "ms-python.vscode-pylance", + "redhat.vscode-yaml" + ] +} diff --git a/lakeflow_pipelines_python/.vscode/settings.json b/lakeflow_pipelines_python/.vscode/settings.json new file mode 100644 index 00000000..47d90b62 --- /dev/null +++ b/lakeflow_pipelines_python/.vscode/settings.json @@ -0,0 +1,21 @@ +{ + "python.analysis.stubPath": ".vscode", + "databricks.python.envFile": "${workspaceFolder}/.env", + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "."
+ ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "python.analysis.extraPaths": ["resources/lakeflow_pipelines_python_pipeline"], + "files.exclude": { + "**/*.egg-info": true, + "**/__pycache__": true, + ".pytest_cache": true, + }, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true, + }, +} diff --git a/lakeflow_pipelines_python/README.md b/lakeflow_pipelines_python/README.md new file mode 100644 index 00000000..e727cdbc --- /dev/null +++ b/lakeflow_pipelines_python/README.md @@ -0,0 +1,41 @@ +# lakeflow_pipelines_python + +The 'lakeflow_pipelines_python' project was generated by using the Lakeflow Pipelines template. + +## Setup + +1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +2. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks auth login + ``` + +3. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from + https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from + https://www.databricks.com/blog/announcing-pycharm-integration-databricks. + + +## Deploying resources + +1. To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + +2. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + +3. Use the "summary" command to review everything that was deployed: + ``` + $ databricks bundle summary + ``` + +4.
To run a job or pipeline, use the "run" command: + ``` + $ databricks bundle run + ``` diff --git a/lakeflow_pipelines_python/databricks.yml b/lakeflow_pipelines_python/databricks.yml new file mode 100644 index 00000000..5438327d --- /dev/null +++ b/lakeflow_pipelines_python/databricks.yml @@ -0,0 +1,47 @@ +# This is a Databricks asset bundle definition for lakeflow_pipelines_python. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: lakeflow_pipelines_python + uuid: 87a174ba-60e4-4867-a140-1936bc9b00de + +include: + - resources/*.yml + - resources/*/*.yml + +# Variable declarations. These variables are assigned in the dev/prod targets below. +variables: + catalog: + description: The catalog to use + schema: + description: The schema to use + notifications: + description: The email addresses to use for failure notifications + +targets: + dev: + # The default target uses 'mode: development' to create a development copy. + # - Deployed resources get prefixed with '[dev my_user_name]' + # - Any job schedules and triggers are paused by default. + # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. + mode: development + default: true + workspace: + host: https://company.databricks.com + variables: + catalog: catalog + schema: ${workspace.current_user.short_name} + notifications: [] + + prod: + mode: production + workspace: + host: https://company.databricks.com + # We explicitly deploy to /Workspace/Users/user@company.com to make sure we only have a single copy. 
+ root_path: /Workspace/Users/user@company.com/.bundle/${bundle.name}/${bundle.target} + permissions: + - user_name: user@company.com + level: CAN_MANAGE + variables: + catalog: catalog + schema: default + notifications: [user@company.com] diff --git a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/README.md b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/README.md new file mode 100644 index 00000000..5e845f08 --- /dev/null +++ b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/README.md @@ -0,0 +1,22 @@ +# lakeflow_pipelines_python_pipeline + +This folder defines all source code for the lakeflow_pipelines_python_pipeline pipeline: + +- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations`: All dataset definitions and transformations. +- `utilities` (optional): Utility functions and Python modules used in this pipeline. +- `data_sources` (optional): View definitions describing the source data for this pipeline. + +## Getting Started + +To get started, go to the `transformations` folder -- most of the relevant source code lives there: + +* By convention, every dataset under `transformations` is in a separate file. +* Take a look at the sample under "sample_trips_lakeflow_pipelines_python.py" to get familiar with the syntax. + Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. +* Use `Run file` to run and preview a single transformation. +* Use `Run pipeline` to run _all_ transformations in the entire pipeline. +* Use `+ Add` in the file browser to add a new data set definition. +* Use `Schedule` to run the pipeline on a schedule! + +For more tutorials and reference material, see https://docs.databricks.com/dlt. 
diff --git a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.job.yml b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.job.yml new file mode 100644 index 00000000..c003b37f --- /dev/null +++ b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.job.yml @@ -0,0 +1,19 @@ +# The job that triggers lakeflow_pipelines_python_pipeline. +resources: + jobs: + lakeflow_pipelines_python_job: + name: lakeflow_pipelines_python_job + + trigger: + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + periodic: + interval: 1 + unit: DAYS + + email_notifications: + on_failure: ${var.notifications} + + tasks: + - task_key: refresh_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.lakeflow_pipelines_python_pipeline.id} diff --git a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.pipeline.yml b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.pipeline.yml new file mode 100644 index 00000000..3db75519 --- /dev/null +++ b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.pipeline.yml @@ -0,0 +1,12 @@ +resources: + pipelines: + lakeflow_pipelines_python_pipeline: + name: lakeflow_pipelines_python_pipeline + serverless: true + channel: "PREVIEW" + catalog: ${var.catalog} + schema: ${var.schema} + root_path: "." 
+ libraries: + - glob: + include: transformations/** diff --git a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_trips_lakeflow_pipelines_python.py b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_trips_lakeflow_pipelines_python.py new file mode 100644 index 00000000..f0db7161 --- /dev/null +++ b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_trips_lakeflow_pipelines_python.py @@ -0,0 +1,13 @@ +import dlt +from pyspark.sql.functions import col +from utilities import utils + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser. + + +@dlt.table +def sample_trips_lakeflow_pipelines_python(): + return spark.read.table("samples.nyctaxi.trips").withColumn("trip_distance_km", utils.distance_km(col("trip_distance"))) diff --git a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_zones_lakeflow_pipelines_python.py b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_zones_lakeflow_pipelines_python.py new file mode 100644 index 00000000..a978db9b --- /dev/null +++ b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_zones_lakeflow_pipelines_python.py @@ -0,0 +1,13 @@ +import dlt +from pyspark.sql.functions import col, sum + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser. 
+ + +@dlt.table +def sample_zones_lakeflow_pipelines_python(): + # Read from the "sample_trips" table, then sum all the fares + return spark.read.table("sample_trips_lakeflow_pipelines_python").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) diff --git a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/utilities/utils.py b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/utilities/utils.py new file mode 100644 index 00000000..ff039898 --- /dev/null +++ b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/utilities/utils.py @@ -0,0 +1,8 @@ +from pyspark.sql.functions import udf +from pyspark.sql.types import FloatType + + +@udf(returnType=FloatType()) +def distance_km(distance_miles): + """Convert distance from miles to kilometers (1 mile = 1.60934 km).""" + return distance_miles * 1.60934 diff --git a/lakeflow_pipelines_sql/.gitignore b/lakeflow_pipelines_sql/.gitignore new file mode 100644 index 00000000..f6a3b5ff --- /dev/null +++ b/lakeflow_pipelines_sql/.gitignore @@ -0,0 +1,8 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +**/explorations/** +!**/explorations/README.md diff --git a/lakeflow_pipelines_sql/.vscode/__builtins__.pyi b/lakeflow_pipelines_sql/.vscode/__builtins__.pyi new file mode 100644 index 00000000..0edd5181 --- /dev/null +++ b/lakeflow_pipelines_sql/.vscode/__builtins__.pyi @@ -0,0 +1,3 @@ +# Typings for Pylance in Visual Studio Code +# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md +from databricks.sdk.runtime import * diff --git a/lakeflow_pipelines_sql/.vscode/extensions.json b/lakeflow_pipelines_sql/.vscode/extensions.json new file mode 100644 index 00000000..5d15eba3 --- /dev/null +++ b/lakeflow_pipelines_sql/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "databricks.databricks", + "ms-python.vscode-pylance", + "redhat.vscode-yaml" + ] +} diff --git
a/lakeflow_pipelines_sql/.vscode/settings.json b/lakeflow_pipelines_sql/.vscode/settings.json new file mode 100644 index 00000000..d0c85bb8 --- /dev/null +++ b/lakeflow_pipelines_sql/.vscode/settings.json @@ -0,0 +1,21 @@ +{ + "python.analysis.stubPath": ".vscode", + "databricks.python.envFile": "${workspaceFolder}/.env", + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "python.analysis.extraPaths": ["resources/lakeflow_pipelines_sql_pipeline"], + "files.exclude": { + "**/*.egg-info": true, + "**/__pycache__": true, + ".pytest_cache": true, + }, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true, + }, +} diff --git a/lakeflow_pipelines_sql/README.md b/lakeflow_pipelines_sql/README.md new file mode 100644 index 00000000..b4a17db3 --- /dev/null +++ b/lakeflow_pipelines_sql/README.md @@ -0,0 +1,41 @@ +# lakeflow_pipelines_sql + +The 'lakeflow_pipelines_sql' project was generated by using the Lakeflow Pipelines template. + +## Setup + +1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +2. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks auth login + ``` + +3. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from + https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from + https://www.databricks.com/blog/announcing-pycharm-integration-databricks. + + +## Deploying resources + +1. 
To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + +2. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + +3. Use the "summary" command to review everything that was deployed: + ``` + $ databricks bundle summary + ``` + +4. To run a job or pipeline, use the "run" command: + ``` + $ databricks bundle run + ``` diff --git a/lakeflow_pipelines_sql/databricks.yml b/lakeflow_pipelines_sql/databricks.yml new file mode 100644 index 00000000..4beb0c58 --- /dev/null +++ b/lakeflow_pipelines_sql/databricks.yml @@ -0,0 +1,47 @@ +# This is a Databricks asset bundle definition for lakeflow_pipelines_sql. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: lakeflow_pipelines_sql + uuid: 295000fc-1ea8-4f43-befe-d5fb9f7d4ad4 + +include: + - resources/*.yml + - resources/*/*.yml + +# Variable declarations. These variables are assigned in the dev/prod targets below. +variables: + catalog: + description: The catalog to use + schema: + description: The schema to use + notifications: + description: The email addresses to use for failure notifications + +targets: + dev: + # The default target uses 'mode: development' to create a development copy. + # - Deployed resources get prefixed with '[dev my_user_name]' + # - Any job schedules and triggers are paused by default. + # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. + mode: development + default: true + workspace: + host: https://company.databricks.com + variables: + catalog: catalog + schema: ${workspace.current_user.short_name} + notifications: [] + + prod: + mode: production + workspace: + host: https://company.databricks.com + # We explicitly deploy to /Workspace/Users/user@company.com to make sure we only have a single copy.
+ root_path: /Workspace/Users/user@company.com/.bundle/${bundle.name}/${bundle.target} + permissions: + - user_name: user@company.com + level: CAN_MANAGE + variables: + catalog: catalog + schema: default + notifications: [user@company.com] diff --git a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/README.md b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/README.md new file mode 100644 index 00000000..d01f290a --- /dev/null +++ b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/README.md @@ -0,0 +1,21 @@ +# lakeflow_pipelines_sql_pipeline + +This folder defines all source code for the 'lakeflow_pipelines_sql_pipeline' pipeline: + +- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations`: All dataset definitions and transformations. +- `data_sources` (optional): View definitions describing the source data for this pipeline. + +## Getting Started + +To get started, go to the `transformations` folder -- most of the relevant source code lives there: + +* By convention, every dataset under `transformations` is in a separate file. +* Take a look at the sample under "sample_trips_lakeflow_pipelines_sql.sql" to get familiar with the syntax. + Read more about the syntax at https://docs.databricks.com/dlt/sql-ref.html. +* Use `Run file` to run and preview a single transformation. +* Use `Run pipeline` to run _all_ transformations in the entire pipeline. +* Use `+ Add` in the file browser to add a new data set definition. +* Use `Schedule` to run the pipeline on a schedule! + +For more tutorials and reference material, see https://docs.databricks.com/dlt. 
diff --git a/contrib/data_engineering/assets/etl_pipeline/etl_pipeline.job.yml b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.job.yml similarity index 64% rename from contrib/data_engineering/assets/etl_pipeline/etl_pipeline.job.yml rename to lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.job.yml index 3792434e..32ba1ce4 100644 --- a/contrib/data_engineering/assets/etl_pipeline/etl_pipeline.job.yml +++ b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.job.yml @@ -1,8 +1,8 @@ -# The job that triggers etl_pipeline. +# The job that triggers lakeflow_pipelines_sql_pipeline. resources: jobs: - etl_pipeline_job: - name: etl_pipeline_job + lakeflow_pipelines_sql_job: + name: lakeflow_pipelines_sql_job trigger: # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger @@ -16,4 +16,4 @@ resources: tasks: - task_key: refresh_pipeline pipeline_task: - pipeline_id: ${resources.pipelines.etl_pipeline.id} \ No newline at end of file + pipeline_id: ${resources.pipelines.lakeflow_pipelines_sql_pipeline.id} diff --git a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.pipeline.yml b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.pipeline.yml new file mode 100644 index 00000000..781c9fd6 --- /dev/null +++ b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.pipeline.yml @@ -0,0 +1,12 @@ +resources: + pipelines: + lakeflow_pipelines_sql_pipeline: + name: lakeflow_pipelines_sql_pipeline + serverless: true + channel: "PREVIEW" + catalog: ${var.catalog} + schema: ${var.schema} + root_path: "." 
+ libraries: + - glob: + include: transformations/** diff --git a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_trips_lakeflow_pipelines_sql.sql b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_trips_lakeflow_pipelines_sql.sql new file mode 100644 index 00000000..09dda0bf --- /dev/null +++ b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_trips_lakeflow_pipelines_sql.sql @@ -0,0 +1,9 @@ +-- This file defines a sample transformation. +-- Edit the sample below or add new transformations +-- using "+ Add" in the file browser. + +CREATE MATERIALIZED VIEW sample_trips_lakeflow_pipelines_sql AS +SELECT + pickup_zip, + fare_amount +FROM samples.nyctaxi.trips diff --git a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_zones_lakeflow_pipelines_sql.sql b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_zones_lakeflow_pipelines_sql.sql new file mode 100644 index 00000000..5f5c567d --- /dev/null +++ b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_zones_lakeflow_pipelines_sql.sql @@ -0,0 +1,10 @@ +-- This file defines a sample transformation. +-- Edit the sample below or add new transformations +-- using "+ Add" in the file browser. + +CREATE MATERIALIZED VIEW sample_zones_lakeflow_pipelines_sql AS +SELECT + pickup_zip, + SUM(fare_amount) AS total_fare +FROM sample_trips_lakeflow_pipelines_sql +GROUP BY pickup_zip diff --git a/scripts/update_from_templates.sh b/scripts/update_from_templates.sh new file mode 100755 index 00000000..a8b74fcf --- /dev/null +++ b/scripts/update_from_templates.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +set -euo pipefail + +function cleanup() { + cd "$1" + local BUNDLE_UUID="$2" + + # Replace specific names with company.databricks.com, user@company.com, user_name + find . 
-type f -exec sed -i '' -E 's|e2[^[:space:]]*\.com|company.databricks.com|g' {} \; + find . -type f -exec sed -i '' -E 's|[A-Za-z0-9._%+-]+@databricks\.com|user@company.com|g' {} \; + find . -type f -exec sed -i '' -e "s|$CURRENT_USER_NAME|user_name|g" {} \; + find . -type f -exec sed -i '' -E "s|^([[:space:]]*uuid:[[:space:]]*)[^[:space:]]*[[:space:]]*$|\\1$BUNDLE_UUID|g" {} \; + + cd .. +} + +function init_bundle() { + local TEMPLATE_NAME="$1" + local BUNDLE_UUID="${2:-}" + local CONFIG_JSON="$3" + + # Extract project_name from JSON + local PROJECT_NAME=$(echo "$CONFIG_JSON" | grep -o '"project_name"[[:space:]]*:[[:space:]]*"[^"]*"' | cut -d'"' -f4) + + # Use 'cli' if available, otherwise fall back to 'databricks' + local CLI_CMD="databricks" + if command -v cli >/dev/null 2>&1; then + CLI_CMD="cli" + fi + + echo + echo "# $PROJECT_NAME" + + rm -rf "$PROJECT_NAME" + echo "$CONFIG_JSON" > /tmp/config.json + $CLI_CMD bundle init "$TEMPLATE_NAME" --config-file /tmp/config.json + cleanup "$PROJECT_NAME" "$BUNDLE_UUID" +} + +# Check and extract the host from the databrickscfg file +if [ ! -f ~/.databrickscfg ]; then + echo "Error: ~/.databrickscfg not found." >&2 + exit 1 +fi + +DATABRICKS_HOST=$(grep -A1 '\[DEFAULT\]' ~/.databrickscfg | grep 'host' | awk -F'=' '{print $2}' | xargs || true) +if [ ! "$DATABRICKS_HOST" ]; then + echo "Error: expected ~/.databrickscfg file with a [DEFAULT] section with the first line of the form 'host=...'." >&2 + exit 1 +fi + +if [ -n "${1:-}" ]; then + CURRENT_USER_NAME="$1" +else + # No user name argument was given, so prompt for it interactively (once). + read -p "Enter the current user name (e.g., 'lennart_kats'): " CURRENT_USER_NAME + if [ ! "$CURRENT_USER_NAME" ]; then + echo "Error: current user name is required." >&2 + exit 1 + fi +fi + +cd "$(dirname "$0")/.."
+ +init_bundle "default-python" "87d5a23e-7bc7-4f52-98ee-e374b67d5681" '{ + "project_name": "default_python", + "include_notebook": "yes", + "include_dlt": "yes", + "include_python": "yes", + "serverless": "yes" +}' + +init_bundle "default-sql" "853cd9bc-631c-4d4f-bca0-3195c7540854" '{ + "project_name": "default_sql", + "http_path": "/sql/1.0/warehouses/abcdef1234567890", + "default_catalog": "catalog", + "personal_schemas": "yes, automatically use a schema based on the current user name during development" +}' + +init_bundle "dbt-sql" "5e5ca8d5-0388-473e-84a1-1414ed89c5df" '{ + "project_name": "dbt_sql", + "http_path": "/sql/1.0/warehouses/abcdef1234567890", + "default_catalog": "catalog", + "personal_schemas": "yes, use a schema based on the current user name during development" +}' + +init_bundle "lakeflow-pipelines" "295000fc-1ea8-4f43-befe-d5fb9f7d4ad4" '{ + "project_name": "lakeflow_pipelines_sql", + "default_catalog": "catalog", + "personal_schemas": "yes", + "language": "sql" +}' + + +init_bundle "lakeflow-pipelines" "87a174ba-60e4-4867-a140-1936bc9b00de" '{ + "project_name": "lakeflow_pipelines_python", + "default_catalog": "catalog", + "personal_schemas": "yes", + "language": "python" +}' + +cd contrib +( + init_bundle "templates/data-engineering" "e5f6g7h8-i9j0-1234-efgh-567890123456" '{ + "project_name": "data_engineering", + "default_catalog": "catalog", + "personal_schemas": "yes, use a schema based on the current user name during development" + }' +) +cd .. \ No newline at end of file