Basic Usage

This guide covers the basic usage of PyGnome's core components with practical examples.

Working with Genomic Models

PyGnome provides a comprehensive object model for genomic features.

Creating a Genome

from pygnome.genomics.genome import Genome
from pygnome.genomics.chromosome import Chromosome
from pygnome.genomics.gene import Gene
from pygnome.genomics.transcript import Transcript
from pygnome.genomics.exon import Exon
from pygnome.genomics.strand import Strand

# Create a genome
genome = Genome(name="MyGenome", species="Example species")

# Create a chromosome
chromosome = Chromosome(name="chr1", length=248956422)

# Add the chromosome to the genome
genome.add_chromosome(chromosome)

# Create a gene
gene = Gene(
    id="GENE001",
    name="Example Gene",
    chrom="chr1",
    start=1000,
    end=5000,
    strand=Strand.POSITIVE
)

# Create a transcript
transcript = Transcript(
    id="TRANSCRIPT001",
    gene_id="GENE001",
    chrom="chr1",
    start=1000,
    end=5000,
    strand=Strand.POSITIVE
)

# Create exons
exon1 = Exon(
    id="EXON001",
    chrom="chr1",
    start=1000,
    end=2000,
    strand=Strand.POSITIVE
)

exon2 = Exon(
    id="EXON002",
    chrom="chr1",
    start=3000,
    end=5000,
    strand=Strand.POSITIVE
)

# Add exons to transcript
transcript.exons = [exon1, exon2]

# Add transcript to gene
gene.add_transcript(transcript)

# Add gene to chromosome
chromosome.add_gene(gene)

# Now the gene is also accessible from the genome
print(genome.genes["GENE001"])

Parsing Genomic Files

PyGnome provides parsers for common genomic file formats.

Parsing FASTA Files

from pathlib import Path
from pygnome.parsers.fasta.fasta_parser import FastaParser

# Parse a FASTA file
parser = FastaParser(Path("path/to/sequences.fa"))
records = parser.load()

# Access sequences
for record in records:
    print(f"Sequence: {record.identifier}")
    print(f"Length: {len(record.sequence)}")

    # Convert to string if needed
    seq_str = str(record.sequence)
    print(f"First 10 bases: {seq_str[:10]}")

# Load as dictionary for quick access by identifier
sequences = FastaParser(Path("path/to/sequences.fa")).load_as_dict()
my_seq = sequences["chr1"].sequence

Parsing GFF/GTF Files

from pathlib import Path
from pygnome.parsers.gff.gff3_parser import Gff3Parser
from pygnome.parsers.gff.gtf_parser import GtfParser

# Parse a GFF3 file
gff_parser = Gff3Parser(Path("path/to/annotations.gff3"))
for record in gff_parser:
    print(f"{record.type}: {record.chrom}:{record.start}-{record.end}")
    print(f"Attributes: {record.attributes}")

# Parse a GTF file
gtf_parser = GtfParser(Path("path/to/annotations.gtf"))
for record in gtf_parser:
    if record.type == "gene":
        gene_id = record.attributes.get("gene_id")
        gene_name = record.attributes.get("gene_name")
        print(f"Gene: {gene_id} ({gene_name}) - {record.chrom}:{record.start}-{record.end}")

Parsing VCF Files

from pathlib import Path
from pygnome.parsers.vcf.vcf_reader import VcfReader

# Open a VCF file
with VcfReader(Path("path/to/variants.vcf")) as reader:
    # Get sample names
    samples = reader.get_samples()
    print(f"Samples: {samples}")

    # Iterate through records
    for record in reader:
        print(f"Record: {record.get_chrom()}:{record.get_pos()} {record.get_ref()}>{','.join(record.get_alt())}")

        # Create variant objects from the record
        for variant in record:  # Uses VariantFactory internally
            print(f"Variant: {variant}")

        # Access genotypes
        genotypes = record.get_genotypes()
        for i, genotype in enumerate(genotypes):
            print(f"  {samples[i]}: {genotype}")

Working with DNA/RNA Sequences

PyGnome provides memory-efficient representations of DNA and RNA sequences.

Creating and Manipulating DNA Sequences

from pygnome.sequences.dna_string import DnaString
from pygnome.sequences.rna_string import RnaString

# Create a DNA sequence
dna = DnaString("ATGCATGCATGC")
print(f"Length: {len(dna)}")

# Get a subsequence
subseq = dna[3:9]  # Returns a new DnaString
print(f"Subsequence: {subseq}")

# Complement and reverse complement
comp = dna.complement()
rev_comp = dna.reverse_complement()
print(f"Complement: {comp}")
print(f"Reverse complement: {rev_comp}")

# Transcribe DNA to RNA
rna = dna.transcribe()  # Returns an RnaString
print(f"RNA: {rna}")

# Create an RNA sequence
rna = RnaString("AUGCAUGCAUGC")

# Translate RNA to protein
protein = rna.translate()
print(f"Protein: {protein}")

Working with Multiple Sequences

from pygnome.sequences.dna_string_array import DnaStringArray

# Create a DNA string array
array = DnaStringArray()

# Add sequences
idx1 = array.add("ATGCATGC")
idx2 = array.add("GCTAGCTA")

# Add multiple sequences at once
indices = array.add_multiple(["AAAAAA", "CCCCCC", "GGGGGG"])

# Access sequences
seq1 = array[idx1]
print(f"Sequence 1: {seq1}")

# Get subsequences
subseq = array.get_subsequence(idx2, 2, 4)
print(f"Subsequence: {subseq}")

# Get statistics
count, total_nt, bits_per_nt = array.get_stats()
print(f"Sequences: {count}, Total nucleotides: {total_nt}, Bits per nucleotide: {bits_per_nt:.2f}")

Using Feature Stores

Feature stores provide efficient storage and retrieval of genomic features.

Creating and Using a Feature Store

from pygnome.feature_store.genomic_feature_store import GenomicFeatureStore, StoreType
from pygnome.genomics.gene import Gene
from pygnome.genomics.strand import Strand
from pathlib import Path

# Create a feature store using interval trees (default)
store = GenomicFeatureStore()

# Or choose a different implementation
binned_store = GenomicFeatureStore(store_type=StoreType.BINNED, bin_size=100000)
brute_force_store = GenomicFeatureStore(store_type=StoreType.BRUTE_FORCE)

# Create some features
gene1 = Gene(id="GENE001", chrom="chr1", start=1000, end=5000, strand=Strand.POSITIVE)
gene2 = Gene(id="GENE002", chrom="chr1", start=7000, end=9000, strand=Strand.NEGATIVE)
gene3 = Gene(id="GENE003", chrom="chr2", start=2000, end=6000, strand=Strand.POSITIVE)

# Add features to the store
with store:  # Use context manager to ensure proper indexing
    store.add(gene1)
    store.add(gene2)
    store.add(gene3)

# Query features
features_at_position = store.get_by_position("chr1", 1500)
print(f"Features at position chr1:1500: {features_at_position}")

features_in_range = store.get_by_interval("chr1", 4000, 8000)
print(f"Features in range chr1:4000-8000: {features_in_range}")

nearest_feature = store.get_nearest("chr1", 6000)
print(f"Nearest feature to chr1:6000: {nearest_feature}")

# Save and load the store
store.save(Path("path/to/store.pkl"))
loaded_store = GenomicFeatureStore.load(Path("path/to/store.pkl"))

Loading a Complete Genome

from pathlib import Path
from pygnome.parsers.genome_loader import GenomeLoader

# Create a genome loader
loader = GenomeLoader(
    annotation_file=Path("path/to/annotations.gtf"),
    sequence_file=Path("path/to/genome.fa.gz"),
    genome_name="GRCh38",
    species="Homo sapiens",
    verbose=True  # Print progress information
)

# Load genome structure and sequence
genome = loader.load()

# Access genome components
print(f"Genome: {genome.name} ({genome.species})")
print(f"Chromosomes: {len(genome.chromosomes)}")
print(f"Genes: {len(genome.genes)}")

# Get a specific chromosome
chr1 = genome.chromosomes.get("chr1")
if chr1:
    print(f"Chromosome: {chr1.name}, Length: {chr1.length}")
    print(f"Genes on chr1: {len(chr1.genes)}")

# Get a specific gene
tp53 = genome.genes.get("TP53")
if tp53:
    print(f"TP53: {tp53.chrom}:{tp53.start}-{tp53.end} ({tp53.strand})")

    # Get gene transcripts
    for transcript in tp53.transcripts:
        print(f"Transcript {transcript.id}: Exons: {len(transcript.exons)}")