Skip to content

Commit ded576f

Browse files
VCF support for RLE and fix to same-as-reference allele normalization to RLE (#589)
Close #577 Close #587 --------- Co-authored-by: Kori Kuzma <korikuzma@gmail.com>
1 parent 9462037 commit ded576f

21 files changed

+4133
-155
lines changed

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,11 @@ cleaner: clean
123123
cleanest: cleaner
124124
rm -fr .eggs venv
125125

126+
#=> clean-cassettes: delete YAML VCR cassettes under tests/**/casette/
127+
.PHONY: clean-cassettes
128+
clean-cassettes:
129+
find ./tests -type f -path '*/cassettes/*.yaml' -print0 | ${XRM}
130+
126131

127132
## <LICENSE>
128133
## Copyright 2016 Source Code Committers

src/ga4gh/vrs/extras/annotator/vcf.py

Lines changed: 76 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@
1313
VrsObjectIdentifierIs,
1414
use_ga4gh_compute_identifier_when,
1515
)
16-
from ga4gh.vrs import VRS_VERSION, __version__
16+
from ga4gh.vrs import VRS_VERSION, VrsType, __version__
1717
from ga4gh.vrs.dataproxy import _DataProxy
1818
from ga4gh.vrs.extras.translator import AlleleTranslator
19-
from ga4gh.vrs.models import Allele
19+
from ga4gh.vrs.models import Allele, Range
2020

2121
_logger = logging.getLogger(__name__)
2222

@@ -36,17 +36,10 @@ class FieldName(str, Enum):
3636
STARTS_FIELD = "VRS_Starts"
3737
ENDS_FIELD = "VRS_Ends"
3838
STATES_FIELD = "VRS_States"
39+
LENGTHS_FIELD = "VRS_Lengths"
40+
REPEAT_SUBUNIT_LENGTHS_FIELD = "VRS_RepeatSubunitLengths"
3941
ERROR_FIELD = "VRS_Error"
4042

41-
def default_value(self) -> Literal[".", -1]:
42-
"""Provide value to use for default/null case in VCF INFO field
43-
44-
:return: either ``"."`` or ``-1``
45-
"""
46-
if self in (FieldName.IDS_FIELD, FieldName.STATES_FIELD, FieldName.ERROR_FIELD):
47-
return "."
48-
return -1
49-
5043

5144
# VCF character escape map
5245
VCF_ESCAPE_MAP = str.maketrans(
@@ -59,6 +52,11 @@ def default_value(self) -> Literal[".", -1]:
5952
}
6053
)
6154

55+
# ReferenceLengthExpression .sequence values will be included in output VCF if
56+
# length <= this value. This field is optional for RLE since it can be derived
57+
# from the reference sequence. Set to None to always include the sequence.
58+
RLE_SEQ_LIMIT = 50
59+
6260

6361
def dump_alleles_to_pkl(alleles: list[Allele], output_pkl_path: Path) -> None:
6462
"""Create pkl file of dictionary mapping VRS IDs to ingested alleles.
@@ -187,6 +185,24 @@ def _update_vcf_header(
187185
f"corresponding to the GT indexes of the {info_field_desc} alleles"
188186
),
189187
)
188+
vcf.header.info.add(
189+
FieldName.LENGTHS_FIELD.value,
190+
info_field_num,
191+
"Integer",
192+
(
193+
"The length values from ReferenceLengthExpression states for the GA4GH VRS "
194+
f"Alleles corresponding to the GT indexes of the {info_field_desc} alleles"
195+
),
196+
)
197+
vcf.header.info.add(
198+
FieldName.REPEAT_SUBUNIT_LENGTHS_FIELD.value,
199+
info_field_num,
200+
"Integer",
201+
(
202+
"The repeatSubunitLength values from ReferenceLengthExpression states for the GA4GH VRS "
203+
f"Alleles corresponding to the GT indexes of the {info_field_desc} alleles"
204+
),
205+
)
190206

191207
@use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING)
192208
def annotate(
@@ -244,6 +260,8 @@ def annotate(
244260
FieldName.STARTS_FIELD,
245261
FieldName.ENDS_FIELD,
246262
FieldName.STATES_FIELD,
263+
FieldName.LENGTHS_FIELD,
264+
FieldName.REPEAT_SUBUNIT_LENGTHS_FIELD,
247265
]
248266
else:
249267
# no INFO field names need to be designated if not producing an annotated VCF
@@ -275,8 +293,10 @@ def annotate(
275293

276294
if output_vcf_path and vcf_out:
277295
for k in additional_info_fields:
296+
# Convert "" and None values (but not 0) to None.
297+
# Pysam outputs "." for missing values.
278298
record.info[k.value] = [
279-
value or k.default_value() for value in vrs_field_data[k.value]
299+
None if v in ("", None) else v for v in vrs_field_data[k.value]
280300
]
281301
vcf_out.write(record)
282302

@@ -369,20 +389,57 @@ def _get_vrs_object(
369389
vrs_field_data[FieldName.IDS_FIELD].append(allele_id)
370390

371391
if vrs_attributes:
392+
# Initialize fields with None for missing values
393+
# pysam will convert None to "." in VCF output
394+
start = end = None
395+
alt = None
396+
length = repeat_subunit_length = None
397+
372398
if vrs_obj:
399+
# Common fields for all state types
373400
start = vrs_obj.location.start
374401
end = vrs_obj.location.end
375-
alt = (
376-
str(vrs_obj.state.sequence.root)
377-
if vrs_obj.state.sequence
378-
else ""
379-
)
380-
else:
381-
start = end = alt = ""
402+
state = vrs_obj.state
403+
state_type = state.type
404+
405+
if state_type == VrsType.LIT_SEQ_EXPR:
406+
# Sequence is required
407+
alt = state.sequence.root
408+
elif state_type == VrsType.REF_LEN_EXPR:
409+
# Length is required, sequence is optional
410+
length = state.length
411+
if length is None:
412+
err_msg = f"{state_type} requires a non-empty length: {vcf_coords}"
413+
raise VcfAnnotatorError(err_msg)
414+
if isinstance(length, Range):
415+
err_msg = f"{state_type} with Range length not supported for VCF annotation: {vcf_coords}"
416+
raise VcfAnnotatorError(err_msg)
417+
repeat_subunit_length = state.repeatSubunitLength
418+
# Only include sequence if within rle_seq_limit
419+
if state.sequence is not None and (
420+
RLE_SEQ_LIMIT is None or length <= RLE_SEQ_LIMIT
421+
):
422+
alt = state.sequence.root
423+
elif state_type == VrsType.LEN_EXPR:
424+
# Length is required, no sequence field
425+
length = state.length
426+
if length is None:
427+
err_msg = f"{state_type} requires a non-empty length: {vcf_coords}"
428+
raise VcfAnnotatorError(err_msg)
429+
if isinstance(length, Range):
430+
err_msg = f"{state_type} with Range length not supported for VCF annotation: {vcf_coords}"
431+
raise VcfAnnotatorError(err_msg)
432+
else:
433+
err_msg = f"Unsupported state type '{state_type}' for VCF annotation: {vcf_coords}"
434+
raise VcfAnnotatorError(err_msg)
382435

383436
vrs_field_data[FieldName.STARTS_FIELD].append(start)
384437
vrs_field_data[FieldName.ENDS_FIELD].append(end)
385438
vrs_field_data[FieldName.STATES_FIELD].append(alt)
439+
vrs_field_data[FieldName.LENGTHS_FIELD].append(length)
440+
vrs_field_data[FieldName.REPEAT_SUBUNIT_LENGTHS_FIELD].append(
441+
repeat_subunit_length
442+
)
386443

387444
def _get_vrs_data(
388445
self,
@@ -432,7 +489,6 @@ def _get_vrs_data(
432489
# Get VRS data for alts
433490
alts = record.alts or []
434491
alleles = [f"{gnomad_loc}-{record.ref}-{a}" for a in [*alts]]
435-
data = f"{record.chrom}\t{record.pos}\t{record.ref}\t{record.alts}"
436492
for allele in alleles:
437493
if "*" in allele:
438494
_logger.debug("Star allele found: %s", allele)
@@ -444,7 +500,6 @@ def _get_vrs_data(
444500
allele_collection,
445501
vrs_field_data,
446502
assembly,
447-
vrs_data_key=data,
448503
vrs_attributes=vrs_attributes,
449504
require_validation=require_validation,
450505
)

0 commit comments

Comments
 (0)