Species-aware DNA foundation model for genome sequence embeddings
pip install dnabert-species
# Usage example: load a model, embed sequences, predict species, and
# prepare a dataset for fine-tuning.
from dnabert_species import DNABERTSpecies
# Load the model through from_pretrained using the identifier
# "dnabert-species-base".
model = DNABERTSpecies.from_pretrained("dnabert-species-base")
# Generate embeddings for DNA sequences.
# The inputs are plain strings over the nucleotide letters A, T, C and G.
sequences = [
"ATCGATCGATCG",
"GCTAGCTAGCTA"
]
# NOTE(review): presumably one embedding per input sequence; the return
# type and shape are not shown here, so confirm against the library docs.
embeddings = model.encode(sequences)
# Classify species from sequence.
# The same list of sequences is passed to predict_species.
predictions = model.predict_species(sequences)
print(predictions)
# Illustrative output (one dict per input sequence, each with 'species'
# and 'confidence' keys):
# [{'species': 'Homo sapiens', 'confidence': 0.94},
# {'species': 'Mus musculus', 'confidence': 0.87}]
# Fine-tune on custom dataset.
# The dataset is built from two file paths; judging by the names, the
# first holds the sequences (FASTA) and the second the labels (CSV) --
# TODO confirm the expected file layouts.
from dnabert_species import SpeciesDataset
dataset = SpeciesDataset.from_fasta("sequences.fasta", "labels.csv")
model.train(dataset, epochs=10, batch_size=32)

License: MIT