Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
1555bf5
Fix unit tests failing due to time zone difference (#426)
dushyantk1509 Jan 3, 2026
93886d8
Fix stale snapshot detection to return 409 instead of 400 (#425)
cbb330 Jan 5, 2026
14bd9f0
OFD to only soft delete when data_manifest exists (#418)
jiang95-dev Jan 6, 2026
8207ff4
Refactor TableStatsCollectorUtil (#417)
srawat98-dev Jan 6, 2026
a88c055
Update to latest iceberg 1.5 version. (#428)
shanthoosh Jan 7, 2026
84741ed
Add cache to globStatus in OFD (#427)
jiang95-dev Jan 8, 2026
0f038c5
Add arg for OFD to run delete in parallel (#430)
teamurko Jan 8, 2026
f7b0e8b
Honor previous metadata maximum versions configuration defined by the…
shanthoosh Jan 8, 2026
25518b7
Update iceberg to the latest version. (#431)
shanthoosh Jan 9, 2026
b2d7d0e
Implementation[OpenhouseCommitEventTablePartitionStats]: Implement pa…
srawat98-dev Jan 14, 2026
f9be042
Add way for truststore to be configurable in baseapp statemanager (#433)
Will-Lo Jan 15, 2026
05c8a5d
Add 600s max histogram bucket for catalog_metadata_retrieval_latency …
cbb330 Jan 15, 2026
d12c1ec
Optimize build: Enable Gradle build cache for faster incremental buil…
cbb330 Jan 15, 2026
6e6195a
Optimize build: share OpenAPI generator JAR across client modules (#419)
cbb330 Jan 15, 2026
1feddec
Optimize build: Fix parallel build issues with port conflicts and imp…
cbb330 Jan 15, 2026
0cfcb77
Refactor table policy updates to use a class that can be extended (#435)
Will-Lo Jan 16, 2026
1b9bbe7
Add granular task level maintenance job metrics (#434)
kamanavishnu Jan 16, 2026
40c58a2
Optimize build: Enable parallel build by default and fix deprecation …
cbb330 Jan 18, 2026
0016163
Add certificate-based authentication for MySQL (#437)
dushyantk1509 Jan 23, 2026
6249a3d
Optimize build: Remove shadowJar from build task (#423)
cbb330 Jan 26, 2026
b5a0deb
Add Gradle docker tasks for streamlined local development (#438)
cbb330 Jan 26, 2026
581b704
Add distributed data loader project and core interfaces (#440)
robreeves Feb 4, 2026
d85f81f
Fix thread-safety issue in OpenHouseSparkITest for parallel test exec…
dushyantk1509 Feb 5, 2026
9b1313e
Use openhouse.dataloader namespace and correct PyIceberg import (#444)
ShreyeshArangath Feb 5, 2026
62ea0b2
Update GitHub link to point to documentation (#443)
conncui Feb 7, 2026
25ced3b
Add support for Trino query ID in commit metadata application ID (#442)
srawat98-dev Feb 7, 2026
d65db1d
Add dataloader into the GH issue templates (#448)
ShreyeshArangath Feb 10, 2026
32d76a9
[Feature][Dataloader] Support reading FileScanTask using ArrowScan (…
ShreyeshArangath Feb 13, 2026
2ce7320
Make tables client policy fetching extensible (#452)
Will-Lo Feb 13, 2026
1b47f09
Add support for PyPI package publish (#446)
ShreyeshArangath Feb 13, 2026
c6d517e
Add HDFS diagnostic FileIO and tables logging profile
Feb 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report_template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ body:
required: false
- label: "`Spark Client Integration`: This is the Apache Spark integration for OpenHouse catalog. `:integration:spark`"
required: false
- label: "`Python DataLoader`: This is the distributed data loading library for OpenHouse tables. `:integrations:python:dataloader`"
required: false
- label: "`Documentation`: This is the documentation for OpenHouse. `docs`"
required: false
- label: "`Local Docker`: This is the local Docker environment for OpenHouse. `infra/recipes/docker-compose`"
Expand Down
2 changes: 2 additions & 0 deletions .github/ISSUE_TEMPLATE/feature_request_template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ body:
required: false
- label: "`Spark Client Integration`: This is the Apache Spark integration for OpenHouse catalog. `:integration:spark`"
required: false
- label: "`Python DataLoader`: This is the distributed data loading library for OpenHouse tables. `:integrations:python:dataloader`"
required: false
- label: "`Documentation`: This is the documentation for OpenHouse. `docs`"
required: false
- label: "`Local Docker`: This is the local Docker environment for OpenHouse. `infra/recipes/docker-compose`"
Expand Down
11 changes: 10 additions & 1 deletion .github/workflows/build-run-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,16 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: '3.x'
python-version: '3.12'

- name: Install uv
uses: astral-sh/setup-uv@v7
with:
enable-cache: true

- name: Run Data Loader Tests
working-directory: integrations/python/dataloader
run: make sync verify

- name: Install dependencies
run: pip install -r scripts/python/requirements.txt
Expand Down
47 changes: 46 additions & 1 deletion .github/workflows/build-tag-publish.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
name: Gradle Build, Tag, and Publish OpenHouse
name: Build, Tag, and Publish OpenHouse

on:
push:
branches:
- main

permissions:
contents: write # For git tagging

jobs:
build-and-run-tests:
uses: ./.github/workflows/build-run-tests.yml
Expand All @@ -13,6 +16,8 @@ jobs:
name: Build tagged commit
runs-on: ubuntu-latest
needs: build-and-run-tests
outputs:
semVer: ${{ steps.get_tag.outputs.semVer }}
steps:
- name: Checkout project sources
uses: actions/checkout@v6
Expand Down Expand Up @@ -75,3 +80,43 @@ jobs:
run: |
docker build -t linkedin-openhouse-docker.jfrog.io/linkedin/openhouse/jobs-scheduler:${{ steps.get_tag.outputs.semVer }} -f jobs-scheduler.Dockerfile .
docker push linkedin-openhouse-docker.jfrog.io/linkedin/openhouse/jobs-scheduler:${{ steps.get_tag.outputs.semVer }}

# Builds the Python dataloader package and publishes it, versioned with the
# semVer tag emitted by the tag-publish-gradle job.
build-publish-python:
  name: Build and Publish Python
  runs-on: ubuntu-latest
  needs: tag-publish-gradle
  steps:
    - name: Checkout project sources
      uses: actions/checkout@v6

    - name: Set up Python 3.12
      uses: actions/setup-python@v6
      with:
        python-version: '3.12'

    - name: Install uv
      uses: astral-sh/setup-uv@v7
      with:
        enable-cache: true

    - name: Sync dependencies
      working-directory: integrations/python/dataloader
      run: make sync

    - name: Build package
      working-directory: integrations/python/dataloader
      env:
        # Pin the wheel/sdist version to the git tag created by the Gradle job.
        SETUPTOOLS_SCM_PRETEND_VERSION: ${{ needs.tag-publish-gradle.outputs.semVer }}
      run: make build

    - name: Validate package
      working-directory: integrations/python/dataloader
      run: make package-check

    - name: Publish to PyPI
      uses: pypa/gh-action-pypi-publish@release/v1
      with:
        packages-dir: integrations/python/dataloader/dist/
        # NOTE(review): JFrog credentials are supplied but no repository-url is
        # set, so the action will target pypi.org by default — confirm whether a
        # repository-url for the JFrog registry is intended here.
        user: ${{ secrets.JFROG_PYPI_USERNAME }}
        password: ${{ secrets.JFROG_PYPI_PASSWORD }}
        verbose: true
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ hs_err_pid*
*.iws
.idea

# VS Code
.vscode/

# LinkedIn / Gradle / Hardware
.cache
.gradle
Expand Down
17 changes: 14 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
</a>
</div>
<div align="center">
<a href="https://github.com/linkedin/openhouse">
<a href="https://linkedin.github.io/openhouse/">
<img src="https://img.shields.io/badge/github-%23121011.svg?logo=github&logoColor=white" alt="GitHub">
</a>
<a href="https://join.slack.com/t/openhouse-bap9266/shared_invite/zt-2bsi0t8pi-wUOeDvQr8j8d5yl3X8WQJQ">
Expand Down Expand Up @@ -64,8 +64,19 @@ To build OpenHouse, you can use the following command:

### Running OpenHouse with Docker Compose

To run OpenHouse, follow the [SETUP](SETUP.md) guide, which brings up all the OpenHouse
services along with MySQL, Prometheus, Apache Spark, and HDFS.
The quickest way to run OpenHouse locally:
```bash
# Build JARs, Docker images, and start containers (default: oh-hadoop-spark recipe)
./gradlew dockerUp

# Or use a lighter recipe for faster startup
./gradlew dockerUp -Precipe=oh-only

# Stop containers
./gradlew dockerDown
```

For detailed configuration options and testing instructions, see the [SETUP](SETUP.md) guide.

### Deploying OpenHouse to Kubernetes

Expand Down
62 changes: 50 additions & 12 deletions SETUP.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,60 @@

Use this guide to setup local development environment for OpenHouse using docker-compose.

## Build Containers
## Quick Start (Recommended)

The simplest way to build and run OpenHouse locally:

```bash
# Build everything and start containers (uses oh-hadoop-spark recipe by default)
./gradlew dockerUp

# Or choose a specific recipe
./gradlew dockerUp -Precipe=oh-only # Lightweight, local filesystem
./gradlew dockerUp -Precipe=oh-hadoop # With HDFS
./gradlew dockerUp -Precipe=oh-hadoop-spark # Full stack with Spark (default)

# Stop and remove containers
./gradlew dockerDown -Precipe=oh-only
```

This single command:
1. Builds all required JAR files (service bootJars, Spark runtime uber JARs)
2. Builds Docker images
3. Starts all containers in detached mode

**Requirements:**
- Java 17 (`export JAVA_HOME=$(/usr/libexec/java_home -v 17)` on macOS)
- Docker and Docker Compose

### Available Gradle Docker Tasks

| Task | Description |
|------|-------------|
| `./gradlew dockerPrereqs` | Build only the JAR files required by Docker images |
| `./gradlew dockerBuild -Precipe=<recipe>` | Build JARs and Docker images |
| `./gradlew dockerUp -Precipe=<recipe>` | Build everything and start containers |
| `./gradlew dockerDown -Precipe=<recipe>` | Stop and remove containers |

## Available Recipes

Recipes for setting up OpenHouse in local docker are available [here](infra/recipes/docker-compose)

docker-compose.yml files are provided to build multi-container Docker applications so that a fully functional
OpenHouse can run locally on a laptop. The scripts have been tested on macOS.
| Config | Recipe | Notes |
|--------|--------|-------|
| Run OpenHouse Services Only | `oh-only` | Stores data on local filesystem within the application container, with in-memory database. Least resource consuming. |
| Run OpenHouse Services on HDFS | `oh-hadoop` | Stores data on locally running Hadoop HDFS containers, with iceberg-backed database. |
| Run OpenHouse Services on HDFS with Spark | `oh-hadoop-spark` | Stores data on locally running Hadoop HDFS containers, with MySQL database. Spark available for end to end testing. Most resource consuming. Starts Livy server. |

## Manual Docker Compose (Advanced)

Multiple recipes are provided for locally bringing up a docker-compose environment that can be used for testing.
If you prefer manual control over the build process:

Config| docker-compose Directory |Notes
---|--------------------------|---
Run OpenHouse Services Only | oh-only | Stores data on local filesystem within the application container, with in-memory database. Least resource consuming.
Run OpenHouse Services on HDFS | oh-hadoop | Stores data on locally running Hadoop HDFS containers, with iceberg-backed database.
Run OpenHouse Services on HDFS. Also, available Spark | oh-hadoop-spark | Stores data on locally running Hadoop HDFS containers, with MySQL database. Spark available for end to end testing. Most resource consuming. Spark container might need more memory at times. Starts Livy server.
### Build Containers

Before building docker images, you would need to build the openhouse project by running the following command.
Before building docker images, build the openhouse project:
```
./gradlew clean build
./gradlew build
```

Pick a config that suits your testing needs, `cd` into the respective docker-compose directory above, and run the following command to build all the necessary containers:
Expand All @@ -43,7 +79,7 @@ you can remove them by running
docker rmi $(docker images -f "dangling=true" -q)
```

## Run Containers
### Run Containers Manually

Choose a recipe that you want to run, `cd` into the respective docker-compose directory above, and run the
following command to start all the containers.
Expand All @@ -64,6 +100,8 @@ To bring down the containers,
docker compose down
```

> **Note:** The `./gradlew dockerUp` command handles all of this automatically.

## Container Exposed Ports

Following ports can be useful while interacting from host machine with applications running in docker-compose environment.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ public Optional<HistoryConfig> getTableHistory(TableMetadata tableMetadata) {
return getTableHistory(response);
}

private Optional<RetentionConfig> getTableRetention(GetTableResponseBody response) {
protected Optional<RetentionConfig> getTableRetention(GetTableResponseBody response) {
// timePartitionSpec or retention.ColumnPattern should be present to run Retention job on a
// table.
if (response == null
Expand Down Expand Up @@ -104,7 +104,7 @@ private Optional<RetentionConfig> getTableRetention(GetTableResponseBody respons
.build());
}

private Optional<HistoryConfig> getTableHistory(GetTableResponseBody response) {
protected Optional<HistoryConfig> getTableHistory(GetTableResponseBody response) {
if (response == null
|| response.getPolicies() == null
|| response.getPolicies().getHistory() == null) {
Expand All @@ -129,7 +129,7 @@ private Optional<HistoryConfig> getTableHistory(GetTableResponseBody response) {
.build());
}

private Optional<List<ReplicationConfig>> getTableReplication(GetTableResponseBody response) {
protected Optional<List<ReplicationConfig>> getTableReplication(GetTableResponseBody response) {
// At least one replication config must be present
if (response == null
|| response.getPolicies() == null
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
/**
* A callable class to apply an operation to some entity (table/database) by running a Spark job.
* Takes care of the job lifecycle using /jobs API.
*
* <p>NOTE: Every implementation must implement a static {@code OPERATION_TYPE} field in order for
* the job scheduler to load the OperationTask.
*/
@Slf4j
@Getter
Expand Down Expand Up @@ -270,6 +273,20 @@ private void reportJobState(
AppConstants.JOB_DURATION,
System.currentTimeMillis() - startTime,
attributes);

// Granular attributes to publish entity level job metrics
Attributes granularAttributes =
Attributes.of(
AttributeKey.stringKey(AppConstants.ENTITY_NAME),
metadata.getEntityName(),
AttributeKey.stringKey(AppConstants.ENTITY_TYPE),
metadata.getClass().getSimpleName().replace("Metadata", ""),
AttributeKey.stringKey(AppConstants.JOB_TYPE),
getType().getValue(),
AttributeKey.stringKey(AppConstants.JOB_STATE),
state.name());

otelEmitter.count(METRICS_SCOPE, "maintenance_job_completed", 1, granularAttributes);
}

protected abstract boolean launchJob();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@
import com.linkedin.openhouse.jobs.client.TablesClient;
import com.linkedin.openhouse.jobs.client.model.JobConf;
import com.linkedin.openhouse.jobs.scheduler.JobsScheduler;
import com.linkedin.openhouse.jobs.util.AppConstants;
import com.linkedin.openhouse.jobs.util.DataLayoutUtil;
import com.linkedin.openhouse.jobs.util.DatabaseMetadata;
import com.linkedin.openhouse.jobs.util.DirectoryMetadata;
import com.linkedin.openhouse.jobs.util.Metadata;
import com.linkedin.openhouse.jobs.util.TableDataLayoutMetadata;
import com.linkedin.openhouse.jobs.util.TableMetadata;
import io.opentelemetry.api.common.AttributeKey;
import io.opentelemetry.api.common.Attributes;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
Expand Down Expand Up @@ -168,6 +171,15 @@ private List<OperationTask<?>> processMetadataList(
if (optionalOperationTask.isPresent()) {
taskList.add(optionalOperationTask.get());
}

// Publish entity metrics for triggered tasks
Attributes taskAttributes =
Attributes.of(
AttributeKey.stringKey(AppConstants.ENTITY_NAME), metadata.getEntityName(),
AttributeKey.stringKey(AppConstants.ENTITY_TYPE),
metadata.getClass().getSimpleName().replace("Metadata", ""),
AttributeKey.stringKey(AppConstants.JOB_TYPE), jobType.getValue());
otelEmitter.count(METRICS_SCOPE, "maintenance_job_triggered", 1, taskAttributes);
}
return taskList;
}
Expand All @@ -183,6 +195,15 @@ public Optional<OperationTask<?>> processMetadata(
task.setOtelEmitter(otelEmitter);
if (!task.shouldRun()) {
log.info("Skipping task {}", task);

// Publish entity metrics for skipped tasks
Attributes taskAttributes =
Attributes.of(
AttributeKey.stringKey(AppConstants.ENTITY_NAME), metadata.getEntityName(),
AttributeKey.stringKey(AppConstants.ENTITY_TYPE),
metadata.getClass().getSimpleName().replace("Metadata", ""),
AttributeKey.stringKey(AppConstants.JOB_TYPE), task.getType().getValue());
otelEmitter.count(METRICS_SCOPE, "maintenance_job_skipped", 1, taskAttributes);
return Optional.empty();
} else {
if (OperationMode.SUBMIT.equals(operationMode)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,20 +65,30 @@ protected static CommandLine createCommandLine(String[] args, List<Option> extra
}
}

/**
 * Creates a {@link StateManager} backed by a jobs-state API client.
 *
 * @param cmdLine parsed command line; the {@code storageURL} option supplies the API base path
 * @param otelEmitter metrics emitter passed through to the API client
 * @param trustStoreLocation TLS truststore path for the API client; may be {@code null}
 * @return a configured {@code StateManager}
 */
protected static StateManager createStateManager(
    CommandLine cmdLine, OtelEmitter otelEmitter, String trustStoreLocation) {
  final String basePath = cmdLine.getOptionValue("storageURL");
  final JobApi jobApi = createJobApiClient(basePath, otelEmitter, trustStoreLocation);
  return new StateManager(RetryUtil.getJobsStateApiRetryTemplate(), jobApi);
}

protected static StateManager createStateManager(CommandLine cmdLine, OtelEmitter otelEmitter) {
return new StateManager(
RetryUtil.getJobsStateApiRetryTemplate(),
createJobApiClient(cmdLine.getOptionValue("storageURL"), otelEmitter));
createJobApiClient(cmdLine.getOptionValue("storageURL"), otelEmitter, null));
}

/**
 * Extracts the job identifier from the parsed command line.
 *
 * @param cmdLine parsed command line options
 * @return the value of the {@code jobId} option, as returned by {@code getOptionValue}
 */
protected static String getJobId(CommandLine cmdLine) {
  final String jobId = cmdLine.getOptionValue("jobId");
  return jobId;
}

private static JobApi createJobApiClient(String basePath, OtelEmitter otelEmitter) {
protected static JobApi createJobApiClient(
String basePath, OtelEmitter otelEmitter, String trustStoreLocation) {
ApiClient client = null;
try {
client = HousetablesApiClientFactory.getInstance().createApiClient(basePath, null, null);
client =
HousetablesApiClientFactory.getInstance()
.createApiClient(basePath, null, trustStoreLocation);
} catch (MalformedURLException | SSLException e) {
log.error("Jobs Api client creation failed: Failure while initializing ApiClient", e);
otelEmitter.count(
Expand Down
Loading