diff --git a/.gitignore b/.gitignore index 0cc7f06..2856868 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,19 @@ data/lightcurves_new_sectors/ candidates_new_sectors.json results/new_sectors/ +# Phase 2 bulk data (too large for git, re-downloadable) +data/phase2/ +results/phase2/ +!results/phase2/ANALYSIS_LOG.md +!results/phase2/*.png + +# Phase 2 FFI data (re-downloadable) +data/phase2/ffi_sector_*/ +data/phase2/qlp_sector_*/ +data/phase2/qlp_test/ +data/phase2/multisector/ +data/phase2/multisector_test/ + # Temporary analysis scripts python/tls_remaining.py diff --git a/CLAUDE.md b/CLAUDE.md index 24b11de..00c2106 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -20,13 +20,26 @@ exoplanet-hunter/ │ ├── io.rs # CSV parsing, file discovery │ └── main.rs # CLI: search, validate, crossmatch subcommands ├── python/ -│ ├── download_lightcurves.py # Fetch data from MAST/NASA +│ ├── download_lightcurves.py # Phase 1: Fetch TOI data from MAST/NASA +│ ├── download_sector_bulk.py # Phase 2: Bulk sector download (all stars) +│ ├── download_ffi_tesscut.py # Phase 2 FFI: TESScut pixel-level extraction +│ ├── download_qlp_bulk.py # Phase 2 FFI: QLP HLSP bulk download +│ ├── stack_multisector.py # Phase 2 FFI: Multi-sector stacking +│ ├── flag_discoveries.py # Phase 2: Flag new planet candidates │ ├── analyze_candidates.py # Phase-fold plots, cross-matching, reports -│ └── validate_candidates.py # 5-test false positive validation pipeline +│ ├── validate_candidates.py # 5-test false positive validation pipeline +│ ├── run_triceratops.py # TRICERATOPS FPP calculation +│ └── toi210_full_sectors.py # Multi-sector secondary eclipse analysis ├── data/ -│ └── lightcurves/ # Downloaded CSV light curves +│ ├── lightcurves/ # Phase 1: TOI light curves +│ ├── phase2/sector_N/ # Phase 2: SPOC bulk sector data (gitignored) +│ ├── phase2/ffi_sector_N/ # Phase 2: TESScut FFI light curves +│ ├── phase2/qlp_sector_N/ # Phase 2: QLP HLSP light curves +│ └── phase2/multisector/ # Phase 2: 
Multi-sector stacked light curves ├── results/ │ ├── plots/ # Phase-folded light curve PNGs +│ ├── deep_analysis/ # TRICERATOPS, multi-sector results +│ ├── phase2/ # Phase 2 results (gitignored) │ ├── REPORT.md # Analysis report │ ├── VALIDATION_REPORT.md # Scored validation results │ ├── validation_results.json # Detailed per-candidate validation @@ -38,7 +51,9 @@ exoplanet-hunter/ ├── tests/ │ ├── conftest.py # Shared Python test fixtures │ ├── test_validate_candidates.py # Python validation tests -│ └── test_analyze_candidates.py # Python analysis tests +│ ├── test_analyze_candidates.py # Python analysis tests +│ ├── test_flag_discoveries.py # Phase 2: Discovery flagging tests +│ └── test_download_sector_bulk.py # Phase 2: Bulk download tests ├── candidates.json # BLS output (intermediate) ├── Makefile # Build & run automation + `make test` ├── pytest.ini # Python test configuration @@ -48,12 +63,12 @@ exoplanet-hunter/ ## Testing -Run all tests (98 total: 66 Rust + 32 Python): +Run all tests (124 total: 70 Rust + 54 Python): ```bash make test # Run everything -cargo test # Rust only (66 tests: bls, validate, crossmatch, io) -python3.11 -m pytest tests/ -v # Python only (32 tests) +cargo test # Rust only (67 tests + 3 doc-tests) +python3.11 -m pytest tests/ -v # Python only (54 tests) ``` ### Rust test modules: @@ -67,8 +82,10 @@ python3.11 -m pytest tests/ -v # Python only (32 tests) - `tests/test_validate_candidates.py` — Validation functions, scoring (24 tests) - `tests/test_analyze_candidates.py` — Plotting, cross-matching, binning (8 tests) +- `tests/test_flag_discoveries.py` — Discovery flagging, TIC extraction, classification (18 tests) +- `tests/test_download_sector_bulk.py` — TOI fetch, deduplication, error handling (4 tests) -## Pipeline Steps +## Pipeline Steps (Phase 1: TOI Validation) ### Step 1: Download light curves @@ -122,6 +139,93 @@ Both run the same 5 false positive tests. 
The Rust version runs tests in paralle ./target/release/hunt crossmatch -i candidates.json -c data/lightcurves/confirmed_exoplanets.csv -o results/crossmatch_results.csv ``` +## Pipeline Steps (Phase 2: New Planet Discovery) + +Phase 2 downloads ALL stars from a TESS sector (not just TOIs), filters out known TOIs, +and runs BLS on the remaining unstudied stars. Any transit detection on a non-TOI star +is potentially a new planet candidate. + +### Quick start (one command): + +```bash +make phase2 PHASE2_SECTOR=56 PHASE2_LIMIT=1000 +``` + +### Step-by-step: + +```bash +# Step 1: Download non-TOI stars from a sector +python3.11 python/download_sector_bulk.py --sector 56 --limit 1000 +# Add --fgk-only for FGK dwarf stars (higher planet yield) +# Add --author QLP for FFI light curves (~10x more stars) + +# Step 2: Run BLS +./target/release/hunt search -i data/phase2/sector_56 \ + -o results/phase2/candidates_s56.json --snr-threshold 6.0 --n-periods 15000 + +# Step 3: Validate +./target/release/hunt validate -i results/phase2/candidates_s56.json \ + -l data/phase2/sector_56 -o results/phase2/ + +# Step 4: Flag discoveries (cross-ref TOI + cTOI + confirmed catalogs) +python3.11 python/flag_discoveries.py \ + --input results/phase2/candidates_s56.json \ + --validation results/phase2/validation_results.json --min-score 60 +``` + +### Key differences from Phase 1: + +- Downloads ALL sector stars via `astroquery.mast`, not just ExoFOP TOIs +- Filters out known TOIs/cTOIs so only unstudied stars are searched +- Period agreement test (Test 4) is skipped for non-TOIs (no reference period) +- Discovery flagging cross-refs against 3 catalogs (TOI, cTOI, confirmed) +- Planet-sized candidates (Rp/Rs < 0.3) are flagged for ExoFOP cTOI submission + +### Sector selection strategy: + +- Recent sectors (56+) have 200-second FFI cadence — better sensitivity +- Higher-numbered sectors have had less community scrutiny +- Use QLP author for FFI light curves (~160k stars/sector vs ~15k for 
SPOC) + +### Phase 2 FFI: Three Independent Approaches + +SPOC 2-minute targets have already been transit-searched by TESS TPS. To find NEW +planets, we need Full Frame Image (FFI) data — 200k+ stars per sector never searched. + +**Approach 1: TESScut** (`download_ffi_tesscut.py`) + +- Downloads pixel cutouts from TESS FFI, performs aperture photometry +- Works for ALL sectors including 56+ (200-second cadence) +- Requires RA/Dec coordinate input +- Best for: targeted searches around specific fields + +```bash +python3.11 python/download_ffi_tesscut.py --sector 40 --ra 90.0 --dec -66.5 --limit 100 +``` + +**Approach 2: QLP Bulk** (`download_qlp_bulk.py`) + +- Downloads pre-extracted QLP HLSP light curves from MAST +- CRITICAL: Must query with `obs_collection="HLSP"` (not "TESS") +- lightkurve supports `author="QLP"` and `author="TESS-SPOC"` directly +- Best for: bulk scanning of many FFI stars per sector + +```bash +python3.11 python/download_qlp_bulk.py --sector 56 --limit 500 +``` + +**Approach 3: Multi-sector Stacking** (`stack_multisector.py`) + +- Combines light curves from multiple sectors for the same star +- Pushes below SPOC's 7.1-sigma threshold with longer baselines +- Best for: long-period planets (P > 15d) and marginal single-sector signals + +```bash +python3.11 python/stack_multisector.py --tic-ids 14179859 --author QLP +# Or stack top candidates from a previous run: +python3.11 python/stack_multisector.py --from-json results/phase2/candidates_s56.json --min-snr 5.0 +``` + ## Validation Pipeline (Rust: `validate.rs` / Python: `validate_candidates.py`) This is the scientific rigor layer. 
Based on standard methods from: diff --git a/Makefile b/Makefile index 0d65947..637ee6c 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: all setup download hunt analyze clean viral test test-rust test-python +.PHONY: all setup download hunt analyze clean viral test test-rust test-python phase2 phase2-download phase2-download-fgk phase2-hunt phase2-validate phase2-flag phase2-ffi phase2-qlp phase2-stack # ============================================================================ # 🔭 Exoplanet Hunter — Makefile @@ -55,6 +55,88 @@ analyze: --crossmatch \ --top-n $(TOP_N) +# ============================================================================ +# 🔭 Phase 2: New Planet Discovery +# ============================================================================ + +PHASE2_SECTOR ?= 56 +PHASE2_LIMIT ?= 1000 +PHASE2_AUTHOR ?= SPOC + +# Download all stars in a sector (filtering out known TOIs) +phase2-download: + @echo "🔭 Phase 2: Downloading non-TOI stars from sector $(PHASE2_SECTOR)..." + python3.11 python/download_sector_bulk.py \ + --sector $(PHASE2_SECTOR) --limit $(PHASE2_LIMIT) \ + --author $(PHASE2_AUTHOR) + +# Download FGK dwarfs only (higher planet yield, slower download) +phase2-download-fgk: + @echo "🔭 Phase 2: Downloading FGK dwarfs from sector $(PHASE2_SECTOR)..." + python3.11 python/download_sector_bulk.py \ + --sector $(PHASE2_SECTOR) --limit $(PHASE2_LIMIT) \ + --author $(PHASE2_AUTHOR) --fgk-only + +# Run BLS on Phase 2 data +phase2-hunt: target/release/hunt + @echo "🔭 Phase 2: Running BLS on sector $(PHASE2_SECTOR) non-TOI stars..." + mkdir -p results/phase2 + ./target/release/hunt search \ + -i data/phase2/sector_$(PHASE2_SECTOR) \ + -o results/phase2/candidates_s$(PHASE2_SECTOR).json \ + --snr-threshold $(SNR) --n-periods 15000 + +# Validate Phase 2 candidates +phase2-validate: target/release/hunt + @echo "🔬 Phase 2: Validating candidates..." 
+ ./target/release/hunt validate \ + -i results/phase2/candidates_s$(PHASE2_SECTOR).json \ + -l data/phase2/sector_$(PHASE2_SECTOR) \ + -o results/phase2 + +# Flag new discoveries +phase2-flag: + @echo "🏴 Phase 2: Flagging new discoveries..." + python3.11 python/flag_discoveries.py \ + --input results/phase2/candidates_s$(PHASE2_SECTOR).json \ + --validation results/phase2/validation_results.json \ + --min-score 60 + +# Full Phase 2 pipeline: download → hunt → validate → flag +phase2: phase2-download phase2-hunt phase2-validate phase2-flag + @echo "" + @echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + @echo "🔭 Phase 2 complete! Check results/phase2/" + @echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + +# ============================================================================ +# 🔭 Phase 2 FFI: Three Independent Approaches +# ============================================================================ + +PHASE2_RA ?= 90.0 +PHASE2_DEC ?= -66.5 +PHASE2_RADIUS ?= 0.5 + +# Approach 1: TESScut FFI extraction (works all sectors, pixel-level photometry) +phase2-ffi: + @echo "🔭 Phase 2 FFI: TESScut extraction — Sector $(PHASE2_SECTOR)..." + python3.11 python/download_ffi_tesscut.py \ + --sector $(PHASE2_SECTOR) --limit $(PHASE2_LIMIT) \ + --ra $(PHASE2_RA) --dec $(PHASE2_DEC) --radius $(PHASE2_RADIUS) + +# Approach 2: QLP bulk download (160k+ stars/sector, pre-extracted light curves) +phase2-qlp: + @echo "🔭 Phase 2 FFI: QLP bulk download — Sector $(PHASE2_SECTOR)..." + python3.11 python/download_qlp_bulk.py \ + --sector $(PHASE2_SECTOR) --limit $(PHASE2_LIMIT) + +# Approach 3: Multi-sector stacking (push below SPOC's SNR threshold) +phase2-stack: + @echo "🔭 Phase 2: Multi-sector stacking..." 
+ python3.11 python/stack_multisector.py \ + --sector $(PHASE2_SECTOR) --top-n $(PHASE2_LIMIT) \ + --author $(PHASE2_AUTHOR) + # Quick hunt: aggressive settings for maximum candidates aggressive: ./target/release/hunt \ diff --git a/python/download_ffi_tesscut.py b/python/download_ffi_tesscut.py new file mode 100644 index 0000000..77d299d --- /dev/null +++ b/python/download_ffi_tesscut.py @@ -0,0 +1,329 @@ +#!/usr/bin/env python3.11 +"""Phase 2 FFI Approach 1: Extract light curves from TESS Full Frame Images via TESScut. + +TESScut (astroquery.mast.Tesscut) downloads pixel-level cutouts from TESS FFI data. +We perform simple aperture photometry to extract a light curve. This accesses ~200k+ +stars per sector that were NEVER on the 2-minute target list and were NEVER searched +by SPOC TPS — the best source for genuine new planet discoveries. + +Advantages over eleanor: + - Works for ALL sectors (including 56+ with 200-second cadence) + - No tensorflow/numpy version issues + - Built into astroquery (already a dependency) + +Pipeline: + 1. Query TIC catalog for stars in a sector (by coordinate cone search or bulk TIC query) + 2. Filter out SPOC 2-minute targets and known TOIs + 3. For each star: download TESScut cutout → aperture photometry → normalize → save CSV + 4. 
def get_ffi_targets_from_tic(
    ra: float, dec: float, radius: float, sector: int
) -> list[dict]:
    """Query the TIC catalog for stars in a cone that are usable for FFI photometry.

    Args:
        ra: Right ascension of cone center (degrees).
        dec: Declination of cone center (degrees).
        radius: Cone search radius (degrees).
        sector: TESS sector. Kept for interface compatibility; the query
            itself does not use it, so no on-silicon check happens here.

    Returns:
        List of dicts with keys ``tic_id``, ``ra``, ``dec`` and ``Tmag``,
        restricted to stars with a finite Tmag between 4 and 14 (inclusive).
    """
    from astroquery.mast import Catalogs

    bright_limit = 4.0   # brighter than this: treated as saturated
    faint_limit = 14.0   # fainter than this: too faint for FFI photometry

    print(f" Querying TIC catalog: RA={ra:.2f}, Dec={dec:.2f}, r={radius:.1f} deg...")
    result = Catalogs.query_region(
        f"{ra} {dec}",
        radius=radius,
        catalog="TIC",
    )

    targets = []
    for row in result:
        # Plain item access avoids relying on Row.get(), which older astropy
        # releases may not provide.
        tmag = row["Tmag"]
        if tmag is None or np.ma.is_masked(tmag):
            continue  # No magnitude in the catalog

        tmag = float(tmag)
        # A chained comparison is False for NaN, so stars with an undefined
        # magnitude are rejected together with the too-bright/too-faint ones.
        if not (bright_limit <= tmag <= faint_limit):
            continue

        targets.append({
            "tic_id": int(row["ID"]),
            "ra": float(row["ra"]),
            "dec": float(row["dec"]),
            "Tmag": tmag,
        })

    print(f" TIC catalog: {len(targets)} stars with Tmag 4-14")
    return targets
def extract_lightcurve_from_cutout(tic_id: int, ra: float, dec: float,
                                   sector: int, cutout_size: int = 11
                                   ) -> tuple[np.ndarray, np.ndarray, np.ndarray] | None:
    """Download a TESScut cutout and perform simple aperture photometry.

    The cutout is requested at (ra, dec); the flux is summed over the central
    3x3 pixels of the returned cube and the background is estimated from the
    two-pixel-wide outer frame of the cutout.

    Args:
        tic_id: TIC identifier of the target. Not used by the extraction
            itself; kept so callers can pass it positionally.
        ra: Right ascension of the target (degrees).
        dec: Declination of the target (degrees).
        sector: TESS sector to request from TESScut.
        cutout_size: Edge length of the requested cutout (pixels).

    Returns:
        ``(time, flux, flux_err)`` with median-normalized flux, or ``None``
        when the download fails, no cutout is returned, or fewer than 100
        usable cadences remain.
    """
    from astropy.coordinates import SkyCoord
    from astroquery.mast import Tesscut

    try:
        coord = SkyCoord(ra=ra, dec=dec, unit="deg")
        cutout = Tesscut.get_cutouts(
            coordinates=coord,
            size=cutout_size,
            sector=sector,
        )

        if not cutout:
            return None

        data = cutout[0][1].data
        return _aperture_light_curve(data["TIME"], data["FLUX"], data["FLUX_ERR"])

    except Exception:
        # Deliberate best-effort: any per-star failure (network, malformed
        # cutout, unexpected cube shape) is reported as "no light curve" so a
        # bulk run keeps going.
        return None


def _aperture_light_curve(
    time: np.ndarray,
    flux_cube: np.ndarray,
    flux_err_cube: np.ndarray,
    min_points: int = 100,
) -> tuple[np.ndarray, np.ndarray, np.ndarray] | None:
    """Reduce a (cadence, row, col) pixel cube to a normalized, clipped light curve."""
    # FITS data is stored big-endian; work on a native float64 copy of the
    # time axis so downstream consumers never see a foreign byte order.
    time = np.asarray(time, dtype=np.float64)

    # Centre the aperture on the cube that was actually returned rather than
    # on the size that was requested.
    n_rows, n_cols = flux_cube.shape[1:]
    row_c, col_c = n_rows // 2, n_cols // 2
    aperture = (
        slice(None),
        slice(row_c - 1, row_c + 2),
        slice(col_c - 1, col_c + 2),
    )
    raw_flux = np.nansum(flux_cube[aperture], axis=(1, 2))
    raw_err = np.sqrt(np.nansum(flux_err_cube[aperture] ** 2, axis=(1, 2)))

    # Background: per-cadence median of the two-pixel-wide outer frame,
    # scaled to the 3x3 aperture.
    ring = np.ones((n_rows, n_cols), dtype=bool)
    ring[2:-2, 2:-2] = False
    bkg_per_pixel = np.nanmedian(flux_cube[:, ring], axis=1)
    n_ap_pixels = 9  # 3x3
    raw_flux = raw_flux - bkg_per_pixel * n_ap_pixels

    # Remove bad cadences
    good = np.isfinite(time) & np.isfinite(raw_flux) & (raw_flux > 0)
    time, raw_flux, raw_err = time[good], raw_flux[good], raw_err[good]
    if len(time) < min_points:
        return None

    # Normalize to the median flux level
    median_flux = np.median(raw_flux)
    norm_flux = raw_flux / median_flux
    norm_err = raw_err / median_flux

    # Remove 5-sigma outliers using a MAD-based sigma. A zero MAD (more than
    # half the samples identical) would make the strict "<" comparison reject
    # every cadence, so the clip is skipped in that case.
    med = np.median(norm_flux)
    mad = np.median(np.abs(norm_flux - med)) * 1.4826
    if mad > 0:
        keep = np.abs(norm_flux - med) < 5 * mad
        time, norm_flux, norm_err = time[keep], norm_flux[keep], norm_err[keep]

    if len(time) < min_points:
        return None

    return time, norm_flux, norm_err
directory") + parser.add_argument("--fgk-only", action="store_true", help="Filter for FGK dwarfs") + parser.add_argument("--cutout-size", type=int, default=11, help="TESScut cutout size (pixels)") + args = parser.parse_args() + + output_dir = Path(args.output) if args.output else Path(f"data/phase2/ffi_sector_{args.sector}") + output_dir.mkdir(parents=True, exist_ok=True) + + # Default pointing: use a known bright field in the sector + # User should provide RA/Dec for targeted searches + if args.ra is None or args.dec is None: + print("ERROR: --ra and --dec required (center of field to search)") + print(" Tip: Use TESS-Point or the TESS viewing tool to find coordinates") + print(" Example: --ra 90.0 --dec -66.5 (TESS CVZ south)") + sys.exit(1) + + print("=" * 60) + print(f"PHASE 2 FFI: TESScut Light Curve Extraction — Sector {args.sector}") + print(f" Center: RA={args.ra:.4f}, Dec={args.dec:.4f}") + print(f" Radius: {args.radius} deg") + print(f" Limit: {args.limit}") + print(f" FGK filter: {args.fgk_only}") + print(f" Output: {output_dir}") + print("=" * 60) + + # Step 1: Get targets from TIC catalog + print(f"\n[1/5] Querying TIC catalog for field stars...") + targets = get_ffi_targets_from_tic(args.ra, args.dec, args.radius, args.sector) + + if not targets: + print("ERROR: No TIC targets found in this field.") + sys.exit(1) + + # Step 2: Remove SPOC 2-minute targets (already searched by TESS pipeline) + print(f"\n[2/5] Removing SPOC 2-minute targets (already transit-searched)...") + spoc_tics = get_spoc_target_ids(args.sector) + targets = [t for t in targets if t["tic_id"] not in spoc_tics] + print(f" After removing SPOC targets: {len(targets)}") + + # Step 3: Remove known TOIs + print(f"\n[3/5] Removing known TOIs and cTOIs...") + sys.path.insert(0, str(Path(__file__).parent)) + from download_sector_bulk import fetch_toi_tic_ids + known_tics = fetch_toi_tic_ids() + targets = [t for t in targets if t["tic_id"] not in known_tics] + print(f" After removing 
TOIs/cTOIs: {len(targets)}") + + # Step 4: Optional FGK filter + if args.fgk_only: + print(f"\n[4/5] Filtering for FGK dwarfs...") + from download_sector_bulk import filter_fgk_dwarfs + fgk_ids = set(filter_fgk_dwarfs([t["tic_id"] for t in targets])) + targets = [t for t in targets if t["tic_id"] in fgk_ids] + else: + print(f"\n[4/5] Skipping FGK filter") + + # Sort by brightness (brightest first = best SNR) + targets.sort(key=lambda t: t["Tmag"]) + targets = targets[:args.limit] + print(f"\n Final target count: {len(targets)} (sorted by Tmag)") + + # Save manifest + manifest = output_dir / "targets_ffi.json" + with open(manifest, "w") as f: + json.dump({ + "sector": args.sector, + "method": "TESScut", + "ra_center": args.ra, + "dec_center": args.dec, + "radius_deg": args.radius, + "fgk_only": args.fgk_only, + "n_targets": len(targets), + "tic_ids": [t["tic_id"] for t in targets], + }, f, indent=2) + + # Step 5: Extract light curves + print(f"\n[5/5] Extracting FFI light curves via TESScut...") + downloaded, skipped, failed = download_ffi_lightcurves( + targets, args.sector, output_dir + ) + + print(f"\n{'=' * 60}") + print(f" FFI EXTRACTION COMPLETE") + print(f" Extracted: {downloaded}") + print(f" Already had: {skipped}") + print(f" Failed: {failed}") + print(f" Output: {output_dir}") + print(f"{'=' * 60}") + + if downloaded + skipped > 0: + print(f"\nNext: Run BLS on FFI light curves:") + print(f" ./target/release/hunt search -i {output_dir} -o results/phase2/candidates_ffi_s{args.sector}.json --snr-threshold 5.5 --n-periods 15000") + + +if __name__ == "__main__": + main() diff --git a/python/download_qlp_bulk.py b/python/download_qlp_bulk.py new file mode 100644 index 0000000..0b4c51f --- /dev/null +++ b/python/download_qlp_bulk.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python3.11 +"""Phase 2 FFI Approach 2: Download QLP (Quick Look Pipeline) light curves from MAST. + +QLP extracts light curves from TESS Full Frame Images for ~160k-millions of targets +per sector. 
def query_qlp_targets(sector: int, author: str = "QLP") -> list[dict]:
    """List the unique TIC IDs that have QLP/TESS-SPOC FFI light curves in a sector.

    The products are HLSP deliveries, so MAST is queried with
    obs_collection="HLSP"; per the file's own notes, "TESS" returns nothing
    for these pipelines.

    If the sector-wide query (or the parsing of its result) raises for any
    reason, the function prints the error and delegates to
    ``_fallback_query_qlp``.

    Returns:
        List of ``{"tic_id": int}`` dicts in first-seen order, without duplicates.
    """
    import re

    from astroquery.mast import Observations

    def _tic_from_name(raw_name) -> int | None:
        """Pull a TIC ID out of a MAST target_name, or return None."""
        label = str(raw_name).strip()
        if label.isdigit():
            return int(label)
        if "TIC" in label.upper():
            hit = re.search(r'(\d{6,})', label)
            if hit:
                return int(hit.group(1))
        return None

    # Direct HLSP query first (fastest if it works)
    print(f" Querying MAST HLSP for sector {sector} ({author})...")
    print(f" (This may take a while for QLP — 100k+ targets per sector)")
    try:
        obs = Observations.query_criteria(
            obs_collection="HLSP",
            sequence_number=sector,
            provenance_name=author,
            dataproduct_type="timeseries",
        )

        # A dict keeps insertion order, giving de-duplication and
        # first-seen ordering in one structure.
        by_tic: dict[int, dict] = {}
        for record in obs:
            tic = _tic_from_name(record["target_name"])
            if tic and tic not in by_tic:
                by_tic[tic] = {"tic_id": tic}
        targets = list(by_tic.values())

        print(f" Found {len(targets)} unique {author} targets in sector {sector}")
        return targets

    except Exception as e:
        print(f" HLSP bulk query failed or timed out: {e}")
        print(f" Falling back to TESS sector query + per-star QLP check...")
        return _fallback_query_qlp(sector, author)
def download_qlp_lightcurve_fits(
    tic_id: int, sector: int, author: str, output_dir: Path
) -> bool:
    """Download one HLSP light-curve FITS file from MAST and convert it to CSV.

    The flux column is KSPSAP_FLUX for author "QLP" and PDCSAP_FLUX otherwise,
    falling back to SAP_FLUX when that column is absent. The light curve is
    median-normalized and 5-sigma clipped before being written with columns
    time, flux, flux_err.

    Args:
        tic_id: TIC identifier of the star.
        sector: TESS sector number.
        author: HLSP provenance name passed to MAST (e.g. "QLP", "TESS-SPOC").
        output_dir: Directory for the CSV; FITS downloads go to
            ``output_dir / "_fits_cache"``.

    Returns:
        True if the CSV already exists or was written; False if nothing was
        found, too few cadences survive cleaning, or any step raised.
    """
    from astropy.io import fits
    from astroquery.mast import Observations

    default_rel_err = 0.001  # relative uncertainty used when none is usable
    min_points = 100

    filename = f"TIC_{tic_id}_s{sector:04d}_qlp.csv"
    filepath = output_dir / filename

    if filepath.exists():
        return True

    try:
        obs = Observations.query_criteria(
            obs_collection="HLSP",
            sequence_number=sector,
            provenance_name=author,
            target_name=str(tic_id),
            dataproduct_type="timeseries",
        )

        if len(obs) == 0:
            return False

        products = Observations.get_product_list(obs[0])
        lc_products = products[products["productSubGroupDescription"] == "LLC"]
        if len(lc_products) == 0:
            # Try any FITS product
            lc_products = products[
                [".fits" in str(p).lower() for p in products["productFilename"]]
            ]
        if len(lc_products) == 0:
            return False

        manifest = Observations.download_products(
            lc_products[0], download_dir=str(output_dir / "_fits_cache")
        )
        fits_path = manifest["Local Path"][0]

        with fits.open(fits_path) as hdul:
            data = hdul[1].data
            column_names = data.columns.names

            # QLP uses KSPSAP_FLUX, TESS-SPOC uses PDCSAP_FLUX
            flux_col = "KSPSAP_FLUX" if author == "QLP" else "PDCSAP_FLUX"
            if flux_col not in column_names:
                # Fallback to SAP_FLUX
                flux_col = "SAP_FLUX"
            err_col = flux_col.replace("FLUX", "FLUX_ERR")

            # Copy into native-endian float64 arrays while the file is open:
            # FITS data is big-endian and may be backed by the open file.
            time = np.array(data["TIME"], dtype=np.float64)
            flux = np.array(data[flux_col], dtype=np.float64)
            flux_err = (
                np.array(data[err_col], dtype=np.float64)
                if err_col in column_names
                else None
            )

        # Clean
        good = np.isfinite(time) & np.isfinite(flux) & (flux > 0)
        time, flux = time[good], flux[good]
        if flux_err is not None:
            flux_err = flux_err[good]

        if len(time) < min_points:
            return False

        # Normalize
        median_flux = np.median(flux)
        norm_flux = flux / median_flux
        if flux_err is None:
            # The fallback is a *relative* error, so it is applied after
            # normalization (matching the lightkurve-based downloader)
            # instead of being divided by the median flux.
            norm_err = np.full(len(norm_flux), default_rel_err)
        else:
            norm_err = flux_err / median_flux
            # Missing (NaN/inf) uncertainties get the same fallback so the
            # CSV never contains empty flux_err fields.
            norm_err[~np.isfinite(norm_err)] = default_rel_err

        # Remove 5-sigma outliers. A zero MAD would make the strict "<"
        # comparison reject everything, so the clip is skipped in that case.
        med = np.median(norm_flux)
        mad = np.median(np.abs(norm_flux - med)) * 1.4826
        if mad > 0:
            keep = np.abs(norm_flux - med) < 5 * mad
        else:
            keep = np.ones(len(norm_flux), dtype=bool)

        # Same post-clip minimum as the TESScut extractor: never write (and
        # report as success) an empty or near-empty light curve.
        if np.count_nonzero(keep) < min_points:
            return False

        df = pd.DataFrame({
            "time": time[keep],
            "flux": norm_flux[keep],
            "flux_err": norm_err[keep],
        })
        df.to_csv(filepath, index=False)
        return True

    except Exception:
        # Deliberate best-effort: a failure for one star is reported as False
        # so the bulk download loop can continue.
        return False
Try a different sector.") + sys.exit(1) + + tic_ids = [t["tic_id"] for t in targets] + print(f" Total {args.author} targets: {len(tic_ids)}") + + # Step 2: Remove known TOIs + print(f"\n[2/4] Removing known TOIs and cTOIs...") + sys.path.insert(0, str(Path(__file__).parent)) + from download_sector_bulk import fetch_toi_tic_ids + known_tics = fetch_toi_tic_ids() + tic_ids_filtered = [t for t in tic_ids if t not in known_tics] + print(f" Removed {len(tic_ids) - len(tic_ids_filtered)} known TOIs/cTOIs") + print(f" Remaining: {len(tic_ids_filtered)}") + + # Also remove SPOC 2-minute targets (already transit-searched) + from download_ffi_tesscut import get_spoc_target_ids + spoc_tics = get_spoc_target_ids(args.sector) + before = len(tic_ids_filtered) + tic_ids_filtered = [t for t in tic_ids_filtered if t not in spoc_tics] + print(f" Removed {before - len(tic_ids_filtered)} SPOC 2-min targets") + print(f" Non-SPOC, non-TOI targets: {len(tic_ids_filtered)}") + + # Step 3: Optional FGK filter + if args.fgk_only: + print(f"\n[3/4] Filtering for FGK dwarfs...") + from download_sector_bulk import filter_fgk_dwarfs + tic_ids_filtered = filter_fgk_dwarfs(tic_ids_filtered) + else: + print(f"\n[3/4] Skipping FGK filter") + + tic_ids_filtered = tic_ids_filtered[:args.limit] + print(f"\n Final target count: {len(tic_ids_filtered)}") + + # Save manifest + manifest = output_dir / "targets_qlp.json" + with open(manifest, "w") as f: + json.dump({ + "sector": args.sector, + "author": args.author, + "method": "QLP_HLSP", + "obs_collection": "HLSP", + "fgk_only": args.fgk_only, + "n_targets": len(tic_ids_filtered), + "tic_ids": tic_ids_filtered, + }, f, indent=2) + + # Step 4: Download light curves + print(f"\n[4/4] Downloading {len(tic_ids_filtered)} {args.author} light curves...") + downloaded = 0 + skipped = 0 + failed = 0 + + download_fn = download_qlp_lightcurve_fits if args.use_fits_fallback else download_qlp_lightcurve + + for tic_id in tqdm(tic_ids_filtered, desc=f" {args.author} 
download"): + filepath = output_dir / f"TIC_{tic_id}_s{args.sector:04d}_qlp.csv" + if filepath.exists(): + skipped += 1 + continue + + success = download_fn(tic_id, args.sector, args.author, output_dir) + if success: + downloaded += 1 + else: + failed += 1 + + print(f"\n{'=' * 60}") + print(f" QLP DOWNLOAD COMPLETE") + print(f" Downloaded: {downloaded}") + print(f" Already had: {skipped}") + print(f" Failed: {failed}") + print(f" Output: {output_dir}") + print(f"{'=' * 60}") + + if downloaded + skipped > 0: + print(f"\nNext: Run BLS on QLP light curves:") + print(f" ./target/release/hunt search -i {output_dir} -o results/phase2/candidates_qlp_s{args.sector}.json --snr-threshold 5.5 --n-periods 15000") + + +if __name__ == "__main__": + main() diff --git a/python/download_sector_bulk.py b/python/download_sector_bulk.py new file mode 100644 index 0000000..7a2d8b1 --- /dev/null +++ b/python/download_sector_bulk.py @@ -0,0 +1,302 @@ +#!/usr/bin/env python3.11 +"""Phase 2: Download ALL light curves from a TESS sector for new planet discovery. + +Unlike Phase 1 (which only downloads known TOIs), this script downloads light +curves for ALL observed stars in a sector, then filters out known TOIs so we +can run BLS on unstudied stars. Any transit detection on a non-TOI star is +potentially a NEW planet candidate. + +Strategy: + 1. Query MAST for all light curves in a sector (SPOC or QLP) + 2. Download the ExoFOP TOI list and community TOI (cTOI) list + 3. Filter out stars with existing TOI/cTOI designations + 4. Optionally filter for FGK dwarfs (Tmag 8-13) using TIC catalog + 5. Download light curves in parallel batches with resume support + 6. 
def query_sector_targets(sector: int, author: str = "SPOC") -> list[dict]:
    """Query MAST for all light curve targets in a sector.

    Args:
        sector: TESS sector number, matched against ``sequence_number``.
        author: Pipeline that produced the light curves, matched against
            ``provenance_name`` ("SPOC", "QLP" or "TESS-SPOC").

    Returns:
        A list of dicts with 'tic_id' (int) and 'obsid' (str) fields, one
        per unique numeric target name, in the order MAST returned them.
    """
    from astroquery.mast import Observations

    # QLP and TESS-SPOC light curves are HLSP products: querying them with
    # obs_collection="TESS" returns zero rows, so pick the collection from
    # the author exactly as find_all_sectors() in stack_multisector.py does.
    obs_collection = "HLSP" if author in ("QLP", "TESS-SPOC") else "TESS"

    print(f" Querying MAST for sector {sector} ({author})...")
    obs = Observations.query_criteria(
        obs_collection=obs_collection,
        sequence_number=sector,
        provenance_name=author,
        dataproduct_type="timeseries",
    )

    targets = []
    seen = set()
    for row in obs:
        name = str(row["target_name"]).strip()
        # Only purely numeric target names are treated as TIC IDs; each
        # target is kept once even if MAST lists several observations.
        if name.isdigit() and name not in seen:
            seen.add(name)
            targets.append({
                "tic_id": int(name),
                "obsid": str(row["obsid"]),
            })

    print(f" Found {len(targets)} unique targets in sector {sector}")
    return targets
def main() -> None:
    """Command-line entry point for the Phase 2 bulk sector download.

    Parses the CLI options, queries MAST for the sector's targets, optionally
    removes known TOIs/cTOIs and non-FGK stars, truncates the list to
    ``--limit``, writes a ``targets.json`` manifest into the output directory,
    downloads the light curves, and prints a summary with follow-up commands.

    Exits with status 1 when the MAST query returns no targets.
    """
    parser = argparse.ArgumentParser(
        description="Phase 2: Bulk sector download for new planet discovery"
    )
    parser.add_argument("--sector", type=int, required=True, help="TESS sector number")
    parser.add_argument("--limit", type=int, default=1000, help="Max stars to download")
    parser.add_argument(
        "--author", default="SPOC", choices=["SPOC", "QLP", "TESS-SPOC"],
        help="Light curve pipeline (SPOC=2min, QLP=FFI, TESS-SPOC=FFI)"
    )
    parser.add_argument(
        "--output", default=None,
        help="Output directory (default: data/phase2/sector_N)"
    )
    parser.add_argument(
        "--fgk-only", action="store_true",
        help="Filter for FGK dwarf stars (Tmag 8-13, logg>4, Teff 3500-7000K)"
    )
    parser.add_argument(
        "--include-tois", action="store_true",
        help="Include known TOIs (default: exclude them for new discovery)"
    )
    args = parser.parse_args()

    # Fall back to a per-sector directory when --output is not given.
    output_dir = Path(args.output) if args.output else Path(f"data/phase2/sector_{args.sector}")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Echo the effective configuration before any network work starts.
    print("=" * 60)
    print(f"PHASE 2: Bulk Sector Download — Sector {args.sector}")
    print(f" Author: {args.author}")
    print(f" Limit: {args.limit}")
    print(f" FGK filter: {args.fgk_only}")
    print(f" Include TOIs: {args.include_tois}")
    print(f" Output: {output_dir}")
    print("=" * 60)

    # Step 1: Query all targets in sector
    print(f"\n[1/4] Querying MAST for sector {args.sector} targets...")
    targets = query_sector_targets(args.sector, args.author)

    # An empty result is treated as a fatal configuration error.
    if not targets:
        print("ERROR: No targets found. Check sector number and author.")
        sys.exit(1)

    tic_ids = [t["tic_id"] for t in targets]

    # Step 2: Filter out known TOIs
    if not args.include_tois:
        print("\n[2/4] Filtering out known TOIs and cTOIs...")
        known_tics = fetch_toi_tic_ids()
        before = len(tic_ids)
        tic_ids = [t for t in tic_ids if t not in known_tics]
        print(f" Removed {before - len(tic_ids)} known TOI/cTOI stars")
        print(f" Remaining non-TOI targets: {len(tic_ids)}")
    else:
        print("\n[2/4] Skipping TOI filter (--include-tois)")

    # Step 3: Optional FGK filter
    if args.fgk_only:
        print("\n[3/4] Filtering for FGK dwarf stars...")
        tic_ids = filter_fgk_dwarfs(tic_ids)
    else:
        print("\n[3/4] Skipping FGK filter (use --fgk-only to enable)")

    # Apply limit
    # The limit is applied after both filters, so it caps the number of
    # stars downloaded rather than the number of stars considered.
    tic_ids = tic_ids[:args.limit]
    print(f"\n Final target count: {len(tic_ids)}")

    # Save target list for reproducibility
    manifest = output_dir / "targets.json"
    with open(manifest, "w") as f:
        json.dump({
            "sector": args.sector,
            "author": args.author,
            "fgk_only": args.fgk_only,
            "include_tois": args.include_tois,
            "n_targets": len(tic_ids),
            "tic_ids": tic_ids,
        }, f, indent=2)
    print(f" Target list saved: {manifest}")

    # Step 4: Download
    print(f"\n[4/4] Downloading {len(tic_ids)} light curves...")
    downloaded, skipped, failed = download_lightcurves(
        tic_ids, args.sector, output_dir, args.author
    )

    print(f"\n{'=' * 60}")
    print(f" DOWNLOAD COMPLETE")
    print(f" Downloaded: {downloaded}")
    print(f" Already had: {skipped}")
    print(f" Failed: {failed}")
    print(f" Output: {output_dir}")
    print(f"{'=' * 60}")

    # Only suggest the follow-up pipeline when at least one light curve is
    # present on disk (freshly downloaded or left over from an earlier run).
    if downloaded + skipped > 0:
        print(f"\nNext steps:")
        print(f" 1. Run BLS:")
        print(f" ./target/release/hunt search -i {output_dir} -o results/phase2/candidates_s{args.sector}.json --snr-threshold 6.0 --n-periods 15000")
        print(f" 2. Validate:")
        print(f" ./target/release/hunt validate -i results/phase2/candidates_s{args.sector}.json -l {output_dir} -o results/phase2/")
        print(f" 3. Flag discoveries:")
        print(f" python3.11 python/flag_discoveries.py --input results/phase2/candidates_s{args.sector}.json --validation results/phase2/validation_results.json")
def extract_tic_id(filename: str) -> int | None:
    """Return the TIC ID embedded in *filename*, or None if there is none.

    The ID is the first run of digits that follows "TIC" (any letter case),
    optionally separated from it by a single underscore or whitespace
    character, e.g. 'TIC_261136679_s0056.csv' -> 261136679.
    """
    found = re.search(r"TIC[_\s]?(\d+)", filename, flags=re.IGNORECASE)
    return int(found.group(1)) if found is not None else None
def classify_candidate(
    tic_id: int | None,
    planet_score: int | None,
    min_score: int,
    toi_tics: set[int],
    ctoi_tics: set[int],
    confirmed_tics: set[int],
) -> str:
    """Label a candidate as KNOWN_PLANET, KNOWN_TOI, LOW_SCORE, or NEW.

    Catalog membership wins over the score: a candidate whose TIC ID is in
    ``confirmed_tics`` is KNOWN_PLANET, otherwise one in ``toi_tics`` or
    ``ctoi_tics`` is KNOWN_TOI. Candidates without a TIC ID skip the catalog
    checks. Anything left is LOW_SCORE when it has a score below
    ``min_score`` and NEW otherwise (including when no score is available).
    """
    has_id = tic_id is not None

    if has_id and tic_id in confirmed_tics:
        return "KNOWN_PLANET"

    if has_id and (tic_id in toi_tics or tic_id in ctoi_tics):
        return "KNOWN_TOI"

    below_threshold = planet_score is not None and planet_score < min_score
    return "LOW_SCORE" if below_threshold else "NEW"
"--validation", default=None, + help="Validation results JSON from hunt validate" + ) + parser.add_argument( + "--min-score", type=int, default=60, + help="Minimum planet_score to consider (default: 60)" + ) + parser.add_argument( + "--output", default=None, + help="Output report path (default: same dir as input, discovery_report.json)" + ) + args = parser.parse_args() + + input_path = Path(args.input) + if not input_path.exists(): + print(f"ERROR: {input_path} not found") + sys.exit(1) + + print("=" * 60) + print("PHASE 2: Discovery Flagging") + print("=" * 60) + + # Load BLS candidates + with open(input_path) as f: + hunt_report = json.load(f) + + candidates = hunt_report.get("candidates", []) + print(f"\nLoaded {len(candidates)} BLS candidates from {input_path}") + + # Load validation results if available + scores = {} + if args.validation: + val_path = Path(args.validation) + if val_path.exists(): + with open(val_path) as f: + val_results = json.load(f) + for v in val_results: + scores[v["filename"]] = v.get("planet_score", 0) + print(f"Loaded {len(scores)} validation scores from {val_path}") + + # Fetch known catalogs + print("\nFetching known catalogs...") + toi_tics, ctoi_tics, confirmed_tics = fetch_known_catalogs() + all_known = toi_tics | ctoi_tics | confirmed_tics + print(f" Total known TIC IDs: {len(all_known)}") + + # Classify each candidate + print(f"\nClassifying candidates (min_score={args.min_score})...") + results = [] + for c in candidates: + tic_id = extract_tic_id(c["filename"]) + planet_score = scores.get(c["filename"]) + status = classify_candidate( + tic_id, planet_score, args.min_score, + toi_tics, ctoi_tics, confirmed_tics, + ) + results.append({ + "filename": c["filename"], + "tic_id": tic_id, + "period_days": c["period_days"], + "depth_ppm": c["depth_ppm"], + "snr": c["snr"], + "radius_ratio": c["radius_ratio"], + "n_transits": c["n_transits"], + "planet_score": planet_score, + "status": status, + }) + + # Summary + new = [r for r in 
results if r["status"] == "NEW"] + known_toi = [r for r in results if r["status"] == "KNOWN_TOI"] + known_planet = [r for r in results if r["status"] == "KNOWN_PLANET"] + low_score = [r for r in results if r["status"] == "LOW_SCORE"] + + print(f"\n{'=' * 60}") + print(f" DISCOVERY REPORT") + print(f"{'=' * 60}") + print(f" Total candidates: {len(results)}") + print(f" NEW (potential): {len(new)}") + print(f" KNOWN_TOI: {len(known_toi)}") + print(f" KNOWN_PLANET: {len(known_planet)}") + print(f" LOW_SCORE: {len(low_score)}") + print(f"{'=' * 60}") + + if new: + # Sort by planet_score (descending), then SNR + new.sort(key=lambda r: (-(r["planet_score"] or 0), -r["snr"])) + print(f"\n TOP NEW CANDIDATES:") + print(f" {'TIC ID':>12} {'Period(d)':>10} {'SNR':>8} {'Depth(ppm)':>11} {'Rp/Rs':>8} {'Score':>6}") + print(f" {'─' * 62}") + for r in new[:20]: + tic = r["tic_id"] or "?" + score = r["planet_score"] if r["planet_score"] is not None else "?" + print(f" {tic:>12} {r['period_days']:>10.4f} {r['snr']:>8.1f} {r['depth_ppm']:>11.0f} {r['radius_ratio']:>8.4f} {score:>6}") + + # Flag planet-sized candidates specifically + planet_sized = [r for r in new if r["radius_ratio"] < 0.3] + if planet_sized: + print(f"\n PLANET-SIZED NEW CANDIDATES (Rp/Rs < 0.3):") + for r in planet_sized: + tic = r["tic_id"] or "?" 
+ rp_earth = r["radius_ratio"] * 1.0 # approximate, need stellar radius + print(f" TIC {tic}: P={r['period_days']:.4f}d, SNR={r['snr']:.1f}, Rp/Rs={r['radius_ratio']:.4f}") + print(f"\n These are your best candidates for ExoFOP cTOI submission!") + else: + print(f"\n No planet-sized (Rp/Rs < 0.3) new candidates found.") + print(f" All detections have Rp/Rs > 0.3, likely eclipsing binaries.") + else: + print("\n No new candidates found in this batch.") + + # Save report + output_path = Path(args.output) if args.output else input_path.parent / "discovery_report.json" + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, "w") as f: + json.dump({ + "summary": { + "total": len(results), + "new": len(new), + "known_toi": len(known_toi), + "known_planet": len(known_planet), + "low_score": len(low_score), + "min_score_threshold": args.min_score, + }, + "candidates": results, + }, f, indent=2) + print(f"\n Report saved: {output_path}") + + if new: + print(f"\n Next steps for NEW candidates:") + print(f" 1. Generate phase-fold plots for visual inspection") + print(f" 2. Run deep validation (TLS, TRICERATOPS) on top candidates") + print(f" 3. Check MAST/Simbad for known variables at these coordinates") + print(f" 4. Submit as cTOI to ExoFOP: https://exofop.ipac.caltech.edu/tess/") + + +if __name__ == "__main__": + main() diff --git a/python/stack_multisector.py b/python/stack_multisector.py new file mode 100644 index 0000000..a28eff1 --- /dev/null +++ b/python/stack_multisector.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python3.11 +"""Phase 2 FFI Approach 3: Multi-sector stacking for sub-threshold planet detection. + +SPOC TPS has a detection threshold of ~7.1 sigma MES (Multiple Event Statistic). +Our BLS uses SNR >= 6.0. By stacking data from multiple TESS sectors for the same +star, we increase the number of transits observed and push the effective SNR higher. 
def find_all_sectors(tic_id: int, author: str = "SPOC") -> list[int]:
    """Return the sorted sector numbers in which MAST lists this TIC ID.

    The query is restricted to timeseries products from ``author``. QLP and
    TESS-SPOC are looked up under obs_collection="HLSP"; any other author
    under obs_collection="TESS". Rows whose ``sequence_number`` is missing
    or not convertible to int are ignored. Any failure of the query itself
    yields an empty list instead of raising.
    """
    from astroquery.mast import Observations

    if author in ("QLP", "TESS-SPOC"):
        collection = "HLSP"
    else:
        collection = "TESS"

    found = set()
    try:
        table = Observations.query_criteria(
            obs_collection=collection,
            provenance_name=author,
            target_name=str(tic_id),
            dataproduct_type="timeseries",
        )

        for entry in table:
            raw = entry.get("sequence_number")
            if raw is None:
                continue
            try:
                found.add(int(raw))
            except (ValueError, TypeError):
                # Unparseable sector value: skip this row, keep the rest.
                continue
    except Exception:
        # Best-effort lookup: a failed query means "no sectors known".
        return []

    return sorted(found)
def get_candidates_from_json(json_path: str, min_snr: float = 0.0) -> list[int]:
    """Extract TIC IDs from a BLS candidates JSON file.

    Optionally filter by minimum SNR to focus on marginal detections
    that might become significant with more data.

    Args:
        json_path: Path to the JSON file. Two layouts are accepted: a bare
            list of candidate records, or a report object holding that list
            under a "candidates" key (the layout flag_discoveries.py reads).
        min_snr: Keep only candidates whose "snr" is at least this value.
            A missing or null "snr" is treated as 0.

    Returns:
        TIC IDs parsed from each kept candidate's "filename" (for example
        "TIC_12345678_s0056.csv"), in file order. Candidates whose filename
        carries no "TIC_<digits>" part are skipped.
    """
    # `re` is not among this module's top-level imports, so import it here,
    # once per call rather than once per candidate.
    import re

    with open(json_path, encoding="utf-8") as f:
        payload = json.load(f)

    # Iterating a report dict directly would yield its string keys and fail
    # on `.get`, so unwrap the candidate list first.
    if isinstance(payload, dict):
        candidates = payload.get("candidates", [])
    else:
        candidates = payload

    tic_pattern = re.compile(r"TIC_(\d+)")
    tic_ids = []
    for c in candidates:
        snr = c.get("snr", 0)
        if snr is None:
            snr = 0
        if snr < min_snr:
            continue
        m = tic_pattern.search(c.get("filename") or "")
        if m:
            tic_ids.append(int(m.group(1)))

    return tic_ids
not Path(json_path).exists(): + print(f"ERROR: {json_path} not found. Run phase2-hunt first.") + sys.exit(1) + tic_ids = get_candidates_from_json(json_path, args.min_snr) + + tic_ids = tic_ids[:args.top_n] + + output_dir = Path(args.output) if args.output else Path("data/phase2/multisector") + output_dir.mkdir(parents=True, exist_ok=True) + + print("=" * 60) + print(f"PHASE 2: Multi-Sector Stacking") + print(f" Targets: {len(tic_ids)}") + print(f" Author: {args.author}") + print(f" Min sectors: {args.min_sectors}") + print(f" Output: {output_dir}") + print("=" * 60) + + stacked = 0 + skipped_few_sectors = 0 + failed = 0 + results_log = [] + + for tic_id in tqdm(tic_ids, desc=" Multi-sector stacking"): + filename = f"TIC_{tic_id}_multisector.csv" + filepath = output_dir / filename + + if filepath.exists(): + stacked += 1 + continue + + # Find all available sectors + sectors = find_all_sectors(tic_id, args.author) + + if len(sectors) < args.min_sectors: + skipped_few_sectors += 1 + results_log.append({ + "tic_id": tic_id, + "status": "skipped", + "reason": f"only {len(sectors)} sectors (need {args.min_sectors})", + "sectors_available": sectors, + }) + continue + + # Download and stitch + result = download_and_stitch(tic_id, sectors, args.author) + + if result is None: + failed += 1 + results_log.append({ + "tic_id": tic_id, + "status": "failed", + "sectors_available": sectors, + }) + continue + + time, flux, flux_err, sectors_used = result + baseline_days = time[-1] - time[0] + + df = pd.DataFrame({ + "time": time, + "flux": flux, + "flux_err": flux_err, + }) + df.to_csv(filepath, index=False) + + stacked += 1 + results_log.append({ + "tic_id": tic_id, + "status": "stacked", + "sectors_used": sectors_used, + "n_sectors": len(sectors_used), + "n_points": len(time), + "baseline_days": round(baseline_days, 1), + }) + + tqdm.write( + f" TIC {tic_id}: {len(sectors_used)} sectors, " + f"{len(time)} pts, {baseline_days:.0f} days" + ) + + # Save stacking log + log_path = 
output_dir / "stacking_log.json" + with open(log_path, "w") as f: + json.dump(results_log, f, indent=2) + + print(f"\n{'=' * 60}") + print(f" MULTI-SECTOR STACKING COMPLETE") + print(f" Stacked: {stacked}") + print(f" Too few sectors: {skipped_few_sectors}") + print(f" Failed: {failed}") + print(f" Output: {output_dir}") + print(f" Log: {log_path}") + print(f"{'=' * 60}") + + # Summary of best targets + stacked_results = [r for r in results_log if r.get("status") == "stacked"] + if stacked_results: + stacked_results.sort(key=lambda r: r.get("n_sectors", 0), reverse=True) + print(f"\nTop stacked targets (by sector count):") + for r in stacked_results[:10]: + print( + f" TIC {r['tic_id']}: {r['n_sectors']} sectors, " + f"{r['n_points']} pts, {r['baseline_days']} days" + ) + + if stacked > 0: + print(f"\nNext: Run BLS on stacked light curves:") + print(f" ./target/release/hunt search -i {output_dir} -o results/phase2/candidates_multisector.json --snr-threshold 5.5 --n-periods 20000 --max-period 40.0") + print(f"\n NOTE: Use --max-period 40.0 to search for long-period planets") + print(f" that single-sector BLS would miss!") + + +if __name__ == "__main__": + main() diff --git a/results/phase2/ANALYSIS_LOG.md b/results/phase2/ANALYSIS_LOG.md new file mode 100644 index 0000000..2380016 --- /dev/null +++ b/results/phase2/ANALYSIS_LOG.md @@ -0,0 +1,167 @@ +# Phase 2 Analysis Log — Sector 56 + +## Run Parameters +- **Date:** 2026-03-29 +- **Sector:** 56 +- **Author:** SPOC (2-min cadence) +- **Stars downloaded:** 883 / 1000 targets (117 failed downloads) +- **TOI filter:** Excluded all known TOIs and cTOIs +- **FGK filter:** Off (all spectral types) + +## Pipeline Results + +| Stage | Count | +|-------|-------| +| Light curves downloaded | 883 | +| BLS detections (SNR >= 6.0) | 828 | +| Validated (score >= 70) | 274 | +| Validated (score >= 60) | 517 | +| Flagged NEW (not in TOI/cTOI/confirmed) | 434 | +| Planet-sized NEW (Rp/Rs < 0.3) | 1 | + +## Candidate Investigation: 
TIC 97168477 + +### BLS Detection +- Period: 12.3322 d +- Depth: 76,648 ppm +- SNR: 6.5 +- Rp/Rs: 0.2769 +- N transits: 2 +- Planet score: 80/100 + +### Stellar Parameters (TIC Catalog) +- Teff: 5383 K (G/K dwarf) +- R*: 0.909 R_sun +- M*: 0.934 M_sun +- logg: 4.49 (main-sequence dwarf) +- Tmag: 11.7 +- Distance: 237 pc +- Luminosity class: DWARF + +### Derived Planet Parameters (if real) +- Rp = 2.45 R_Jupiter = 27.5 R_Earth (suspiciously large) +- Semi-major axis: 0.102 AU +- T_eq ~ 709 K + +### Deep Analysis Verdict: FALSE POSITIVE + +**Evidence:** +1. Phase-folded light curve shows NO transit dip. In-transit median flux (1.000051) is actually *higher* than out-of-transit (0.999997). Measured depth = -54 ppm (negative). +2. Multi-sector data (sectors 56 + 83, 759 days, 136,883 points) shows no periodic signal at P=12.33d. +3. Only 2 "transits" in sector 56's 27.9-day baseline — insufficient for reliable detection. +4. Deepest flux point (0.9731) is a 4.4-sigma outlier, not a transit. +5. BLS detected scatter/noise pattern, not a physical transit. +6. 3.0-sigma secondary eclipse hint at phase 0.5 — consistent with systematic noise, not a self-luminous companion. + +**Plots:** `TIC_97168477_phase_fold.png`, `TIC_97168477_sector56_detail.png` + +## Conclusions + +### Sector 56 SPOC Results +- **No genuine new planet candidates found** in 883 non-TOI SPOC targets +- All 434 NEW detections have Rp/Rs > 0.3 (eclipsing binaries) except TIC 97168477 +- TIC 97168477 (Rp/Rs=0.277) is confirmed false positive upon deep analysis +- This is expected: SPOC 2-minute targets are a curated list where the TESS pipeline already searched for planets + +### Next Steps +1. **Try QLP data** — 10x more stars per sector from Full Frame Images, many never searched +2. **Target recent sectors (70+)** with less community scrutiny +3. **Use --fgk-only filter** to focus on FGK dwarfs (best planet hosts) +4. 
**Lower SNR threshold cautiously** (5.5 instead of 6.0) for marginal but real signals + +--- + +## Run 2: Sector 70, FGK Dwarfs (Partial — 203 stars) + +### Parameters +- **Date:** 2026-03-29 +- **Sector:** 70 (chosen for lowest TOI coverage: 1.2%) +- **Author:** SPOC +- **FGK filter:** ON (Tmag 8-13, logg>4, Teff 3500-7000K) +- **Stars downloaded:** 203 / 500 target (still downloading) + +### Results + +| Stage | Count | +|-------|-------| +| Light curves | 203 | +| BLS detections | 176 | +| Validated (score >= 70) | 45 | +| Flagged NEW | 88 | +| Planet-sized NEW (Rp/Rs < 0.3) | 0 | + +Smallest Rp/Rs among NEW candidates: TIC 14179859 (Rp/Rs=0.385, P=10.28d, SNR=9.7) + +### Conclusion + +Same pattern as sector 56: no planet-sized candidates among SPOC non-TOI targets. All detections are eclipsing binaries (Rp/Rs > 0.3). The FGK filter didn't change the outcome — the fundamental issue is that SPOC targets have already been searched by the TESS pipeline. + +## Strategic Assessment + +### Why SPOC non-TOIs don't yield new planets + +The TESS SPOC pipeline runs its own transit search (using the Transiting Planet Search module, TPS) on every 2-minute cadence target. Any planet candidate it finds becomes a Threshold Crossing Event (TCE), goes through Data Validation (DV), and if it passes, becomes a TOI. + +**Stars in the SPOC target list that are NOT TOIs have already been searched and cleared by SPOC.** Our BLS is finding the same eclipsing binaries that SPOC found and correctly rejected. + +### What would actually find new planets + +1. **Full Frame Image (FFI) data** — 200k+ stars per sector that were NOT on the 2-minute target list and were NOT searched by SPOC TPS. These stars only have FFI photometry (10-min or 200-sec cadence). QLP extracts light curves but doesn't run a transit search. + +2. **Problem:** QLP/TESS-SPOC FFI light curves are not available as individual timeseries products on MAST through astroquery. 
They may need to be downloaded as bulk FITS files from the MAST archive directly. + +3. **Alternative approach:** Use `eleanor` Python package to extract light curves directly from FFI cutouts for specific TIC IDs. This bypasses the MAST product catalog entirely. + +4. **Another alternative:** Download pre-computed QLP light curves from the bulk download portal at https://archive.stsci.edu/hlsp/qlp + +### Recommended next steps + +- Investigate bulk QLP download from MAST HLSP portal +- Try `eleanor` for on-demand FFI extraction +- Or: accept that laptop-based citizen science may not compete with SPOC on 2-min targets, and focus on: + - Multi-sector stacking for marginal detections below SPOC's threshold + - Long-period planets that SPOC misses (P > 15 days, single transit events) + - Unusual transit shapes that automated pipelines reject + +--- + +## Phase 2 FFI: Three Independent Approaches (Built & Tested) + +### Date: 2026-03-29 + +### Key Discovery: QLP via obs_collection="HLSP" + +The reason previous QLP queries returned 0 results: we used `obs_collection="TESS"`. +QLP and TESS-SPOC are HLSP products — must query with `obs_collection="HLSP"`. 
+Once fixed: **1,037,873 QLP targets in sector 40 alone.** + +### Approach 1: TESScut FFI Extraction (`download_ffi_tesscut.py`) +- Uses `astroquery.mast.Tesscut` for pixel-level cutouts +- 3x3 aperture photometry with background subtraction +- Works for all sectors including 56+ (200-second cadence) +- Requires RA/Dec input (targeted field search) +- Status: **Built, needs real-data test** + +### Approach 2: QLP Bulk Download (`download_qlp_bulk.py`) +- Downloads pre-extracted QLP HLSP light curves via lightkurve +- `lightkurve.search_lightcurve(author="QLP")` works perfectly +- Sector 40 test: 5/5 downloads successful, 3738 pts each +- BLS on QLP data: all 5 detected (all EBs in dense field — expected for test) +- Status: **Built and tested end-to-end** + +### Approach 3: Multi-sector Stacking (`stack_multisector.py`) +- Stitches light curves from multiple sectors per star +- TIC 14179859 test: 5 QLP sectors, 26,665 pts, 811.8 day baseline +- BLS on stacked data: SNR=59.3 at P=25.97d (EB, but detection works) +- Status: **Built and tested end-to-end** + +### eleanor Status: ABANDONED +- eleanor 2.0.5 requires tensorflow, numpy<2.0, and has broken photutils imports +- TESScut provides the same FFI extraction capability without dependency hell +- For sectors 1-55, TESScut is equivalent; for 56+, TESScut is better + +### Next: Scale Up +1. Run QLP bulk on sector 40 with 500+ non-TOI, non-SPOC FGK dwarfs +2. Filter for Rp/Rs < 0.3 candidates (planet-sized) +3. Cross-validate any hits with TESScut (independent photometry) and multi-sector stacking +4. 
Deep analysis on any planet-sized candidates diff --git a/results/phase2/TIC_97168477_phase_fold.png b/results/phase2/TIC_97168477_phase_fold.png new file mode 100644 index 0000000..5a29c1e Binary files /dev/null and b/results/phase2/TIC_97168477_phase_fold.png differ diff --git a/results/phase2/TIC_97168477_sector56_detail.png b/results/phase2/TIC_97168477_sector56_detail.png new file mode 100644 index 0000000..74cd3a5 Binary files /dev/null and b/results/phase2/TIC_97168477_sector56_detail.png differ diff --git a/tests/test_download_sector_bulk.py b/tests/test_download_sector_bulk.py new file mode 100644 index 0000000..a3ccf14 --- /dev/null +++ b/tests/test_download_sector_bulk.py @@ -0,0 +1,61 @@ +"""Tests for the Phase 2 bulk sector download pipeline.""" + +import json +import sys +from pathlib import Path +from unittest.mock import MagicMock, patch + +import numpy as np +import pandas as pd +import pytest + +# Add project root to path +sys.path.insert(0, str(Path(__file__).parent.parent / "python")) +from download_sector_bulk import fetch_toi_tic_ids + + +# =========================================================================== +# fetch_toi_tic_ids +# =========================================================================== + +class TestFetchToiTicIds: + @patch("download_sector_bulk.pd.read_csv") + def test_returns_tic_ids(self, mock_read_csv): + """Test that TOI and cTOI TIC IDs are merged.""" + # First call = TOI list, second = cTOI list + toi_df = pd.DataFrame({"TIC ID": [100, 200, 300]}) + ctoi_df = pd.DataFrame({"TIC ID": [300, 400, 500]}) + mock_read_csv.side_effect = [toi_df, ctoi_df] + + result = fetch_toi_tic_ids() + + assert isinstance(result, set) + assert result == {100, 200, 300, 400, 500} + + @patch("download_sector_bulk.pd.read_csv") + def test_handles_toi_failure(self, mock_read_csv): + """If TOI fetch fails, should still try cTOI.""" + ctoi_df = pd.DataFrame({"TIC ID": [400, 500]}) + mock_read_csv.side_effect = [Exception("Network 
error"), ctoi_df] + + result = fetch_toi_tic_ids() + assert 400 in result + assert 500 in result + + @patch("download_sector_bulk.pd.read_csv") + def test_handles_both_failure(self, mock_read_csv): + """If both fetches fail, returns empty set.""" + mock_read_csv.side_effect = [Exception("fail"), Exception("fail")] + + result = fetch_toi_tic_ids() + assert result == set() + + @patch("download_sector_bulk.pd.read_csv") + def test_deduplicates(self, mock_read_csv): + """Same TIC ID in both catalogs should appear once.""" + toi_df = pd.DataFrame({"TIC ID": [100, 100, 200]}) + ctoi_df = pd.DataFrame({"TIC ID": [200, 300]}) + mock_read_csv.side_effect = [toi_df, ctoi_df] + + result = fetch_toi_tic_ids() + assert result == {100, 200, 300} diff --git a/tests/test_flag_discoveries.py b/tests/test_flag_discoveries.py new file mode 100644 index 0000000..57da048 --- /dev/null +++ b/tests/test_flag_discoveries.py @@ -0,0 +1,146 @@ +"""Tests for the Phase 2 discovery flagging pipeline.""" + +import json +import sys +from pathlib import Path +from unittest.mock import patch + +import pytest + +# Add project root to path +sys.path.insert(0, str(Path(__file__).parent.parent / "python")) +from flag_discoveries import classify_candidate, extract_tic_id + + +# =========================================================================== +# extract_tic_id +# =========================================================================== + +class TestExtractTicId: + def test_standard_filename(self): + assert extract_tic_id("TIC_261136679_s0056.csv") == 261136679 + + def test_toi_filename(self): + assert extract_tic_id("TOI_133.01_TIC_219338557.csv") == 219338557 + + def test_tic_with_space(self): + assert extract_tic_id("TIC 261136679_s0056.csv") == 261136679 + + def test_no_tic(self): + assert extract_tic_id("random_star.csv") is None + + def test_lowercase(self): + assert extract_tic_id("tic_12345_s0001.csv") == 12345 + + def test_large_tic_id(self): + assert 
extract_tic_id("TIC_999999999_s0070.csv") == 999999999 + + +# =========================================================================== +# classify_candidate +# =========================================================================== + +class TestClassifyCandidate: + def setup_method(self): + self.toi_tics = {100, 200, 300} + self.ctoi_tics = {400, 500} + self.confirmed_tics = {600, 700} + + def test_new_candidate(self): + status = classify_candidate( + tic_id=999, planet_score=75, min_score=60, + toi_tics=self.toi_tics, ctoi_tics=self.ctoi_tics, + confirmed_tics=self.confirmed_tics, + ) + assert status == "NEW" + + def test_known_toi(self): + status = classify_candidate( + tic_id=100, planet_score=80, min_score=60, + toi_tics=self.toi_tics, ctoi_tics=self.ctoi_tics, + confirmed_tics=self.confirmed_tics, + ) + assert status == "KNOWN_TOI" + + def test_known_ctoi(self): + status = classify_candidate( + tic_id=400, planet_score=80, min_score=60, + toi_tics=self.toi_tics, ctoi_tics=self.ctoi_tics, + confirmed_tics=self.confirmed_tics, + ) + assert status == "KNOWN_TOI" + + def test_known_planet(self): + status = classify_candidate( + tic_id=600, planet_score=90, min_score=60, + toi_tics=self.toi_tics, ctoi_tics=self.ctoi_tics, + confirmed_tics=self.confirmed_tics, + ) + assert status == "KNOWN_PLANET" + + def test_confirmed_takes_precedence_over_toi(self): + """If a TIC is in both confirmed and TOI, it should be KNOWN_PLANET.""" + toi_tics = {600} # Same as confirmed + status = classify_candidate( + tic_id=600, planet_score=90, min_score=60, + toi_tics=toi_tics, ctoi_tics=self.ctoi_tics, + confirmed_tics=self.confirmed_tics, + ) + assert status == "KNOWN_PLANET" + + def test_low_score(self): + status = classify_candidate( + tic_id=999, planet_score=40, min_score=60, + toi_tics=self.toi_tics, ctoi_tics=self.ctoi_tics, + confirmed_tics=self.confirmed_tics, + ) + assert status == "LOW_SCORE" + + def test_no_score_still_new(self): + """Candidate with no validation 
score is NEW if not in catalogs.""" + status = classify_candidate( + tic_id=999, planet_score=None, min_score=60, + toi_tics=self.toi_tics, ctoi_tics=self.ctoi_tics, + confirmed_tics=self.confirmed_tics, + ) + assert status == "NEW" + + def test_no_tic_id_new_if_high_score(self): + status = classify_candidate( + tic_id=None, planet_score=80, min_score=60, + toi_tics=self.toi_tics, ctoi_tics=self.ctoi_tics, + confirmed_tics=self.confirmed_tics, + ) + assert status == "NEW" + + def test_no_tic_id_low_score(self): + status = classify_candidate( + tic_id=None, planet_score=30, min_score=60, + toi_tics=self.toi_tics, ctoi_tics=self.ctoi_tics, + confirmed_tics=self.confirmed_tics, + ) + assert status == "LOW_SCORE" + + def test_score_at_threshold(self): + """Score exactly at min_score should pass (>=).""" + status = classify_candidate( + tic_id=999, planet_score=60, min_score=60, + toi_tics=self.toi_tics, ctoi_tics=self.ctoi_tics, + confirmed_tics=self.confirmed_tics, + ) + assert status == "NEW" + + def test_score_just_below_threshold(self): + status = classify_candidate( + tic_id=999, planet_score=59, min_score=60, + toi_tics=self.toi_tics, ctoi_tics=self.ctoi_tics, + confirmed_tics=self.confirmed_tics, + ) + assert status == "LOW_SCORE" + + def test_empty_catalogs_all_new(self): + status = classify_candidate( + tic_id=100, planet_score=70, min_score=60, + toi_tics=set(), ctoi_tics=set(), confirmed_tics=set(), + ) + assert status == "NEW"