1313 VrsObjectIdentifierIs ,
1414 use_ga4gh_compute_identifier_when ,
1515)
16- from ga4gh .vrs import VRS_VERSION , __version__
16+ from ga4gh .vrs import VRS_VERSION , VrsType , __version__
1717from ga4gh .vrs .dataproxy import _DataProxy
1818from ga4gh .vrs .extras .translator import AlleleTranslator
19- from ga4gh .vrs .models import Allele
19+ from ga4gh .vrs .models import Allele , Range
2020
2121_logger = logging .getLogger (__name__ )
2222
@@ -36,17 +36,10 @@ class FieldName(str, Enum):
3636 STARTS_FIELD = "VRS_Starts"
3737 ENDS_FIELD = "VRS_Ends"
3838 STATES_FIELD = "VRS_States"
39+ LENGTHS_FIELD = "VRS_Lengths"
40+ REPEAT_SUBUNIT_LENGTHS_FIELD = "VRS_RepeatSubunitLengths"
3941 ERROR_FIELD = "VRS_Error"
4042
41- def default_value (self ) -> Literal ["." , - 1 ]:
42- """Provide value to use for default/null case in VCF INFO field
43-
44- :return: either ``"."`` or ``-1``
45- """
46- if self in (FieldName .IDS_FIELD , FieldName .STATES_FIELD , FieldName .ERROR_FIELD ):
47- return "."
48- return - 1
49-
5043
5144# VCF character escape map
5245VCF_ESCAPE_MAP = str .maketrans (
@@ -59,6 +52,11 @@ def default_value(self) -> Literal[".", -1]:
5952 }
6053)
6154
55+ # ReferenceLengthExpression .sequence values will be included in output VCF if
56+ # length <= this value. This field is optional for RLE since it can be derived
57+ # from the reference sequence. Set to None to always include the sequence.
58+ RLE_SEQ_LIMIT = 50
59+
6260
6361def dump_alleles_to_pkl (alleles : list [Allele ], output_pkl_path : Path ) -> None :
6462 """Create pkl file of dictionary mapping VRS IDs to ingested alleles.
@@ -187,6 +185,24 @@ def _update_vcf_header(
187185 f"corresponding to the GT indexes of the { info_field_desc } alleles"
188186 ),
189187 )
188+ vcf .header .info .add (
189+ FieldName .LENGTHS_FIELD .value ,
190+ info_field_num ,
191+ "Integer" ,
192+ (
193+ "The length values from ReferenceLengthExpression states for the GA4GH VRS "
194+ f"Alleles corresponding to the GT indexes of the { info_field_desc } alleles"
195+ ),
196+ )
197+ vcf .header .info .add (
198+ FieldName .REPEAT_SUBUNIT_LENGTHS_FIELD .value ,
199+ info_field_num ,
200+ "Integer" ,
201+ (
202+ "The repeatSubunitLength values from ReferenceLengthExpression states for the GA4GH VRS "
203+ f"Alleles corresponding to the GT indexes of the { info_field_desc } alleles"
204+ ),
205+ )
190206
191207 @use_ga4gh_compute_identifier_when (VrsObjectIdentifierIs .MISSING )
192208 def annotate (
@@ -244,6 +260,8 @@ def annotate(
244260 FieldName .STARTS_FIELD ,
245261 FieldName .ENDS_FIELD ,
246262 FieldName .STATES_FIELD ,
263+ FieldName .LENGTHS_FIELD ,
264+ FieldName .REPEAT_SUBUNIT_LENGTHS_FIELD ,
247265 ]
248266 else :
249267 # no INFO field names need to be designated if not producing an annotated VCF
@@ -275,8 +293,10 @@ def annotate(
275293
276294 if output_vcf_path and vcf_out :
277295 for k in additional_info_fields :
296+ # Convert "" and None values (but not 0) to None.
297+ # Pysam outputs "." for missing values.
278298 record .info [k .value ] = [
279- value or k . default_value () for value in vrs_field_data [k .value ]
299+ None if v in ( "" , None ) else v for v in vrs_field_data [k .value ]
280300 ]
281301 vcf_out .write (record )
282302
@@ -369,20 +389,57 @@ def _get_vrs_object(
369389 vrs_field_data [FieldName .IDS_FIELD ].append (allele_id )
370390
371391 if vrs_attributes :
392+ # Initialize fields with None for missing values
393+ # pysam will convert None to "." in VCF output
394+ start = end = None
395+ alt = None
396+ length = repeat_subunit_length = None
397+
372398 if vrs_obj :
399+ # Common fields for all state types
373400 start = vrs_obj .location .start
374401 end = vrs_obj .location .end
375- alt = (
376- str (vrs_obj .state .sequence .root )
377- if vrs_obj .state .sequence
378- else ""
379- )
380- else :
381- start = end = alt = ""
402+ state = vrs_obj .state
403+ state_type = state .type
404+
405+ if state_type == VrsType .LIT_SEQ_EXPR :
406+ # Sequence is required
407+ alt = state .sequence .root
408+ elif state_type == VrsType .REF_LEN_EXPR :
409+ # Length is required, sequence is optional
410+ length = state .length
411+ if length is None :
412+ err_msg = f"{ state_type } requires a non-empty length: { vcf_coords } "
413+ raise VcfAnnotatorError (err_msg )
414+ if isinstance (length , Range ):
415+ err_msg = f"{ state_type } with Range length not supported for VCF annotation: { vcf_coords } "
416+ raise VcfAnnotatorError (err_msg )
417+ repeat_subunit_length = state .repeatSubunitLength
418+ # Only include sequence if within rle_seq_limit
419+ if state .sequence is not None and (
420+ RLE_SEQ_LIMIT is None or length <= RLE_SEQ_LIMIT
421+ ):
422+ alt = state .sequence .root
423+ elif state_type == VrsType .LEN_EXPR :
424+ # Length is required, no sequence field
425+ length = state .length
426+ if length is None :
427+ err_msg = f"{ state_type } requires a non-empty length: { vcf_coords } "
428+ raise VcfAnnotatorError (err_msg )
429+ if isinstance (length , Range ):
430+ err_msg = f"{ state_type } with Range length not supported for VCF annotation: { vcf_coords } "
431+ raise VcfAnnotatorError (err_msg )
432+ else :
433+ err_msg = f"Unsupported state type '{ state_type } ' for VCF annotation: { vcf_coords } "
434+ raise VcfAnnotatorError (err_msg )
382435
383436 vrs_field_data [FieldName .STARTS_FIELD ].append (start )
384437 vrs_field_data [FieldName .ENDS_FIELD ].append (end )
385438 vrs_field_data [FieldName .STATES_FIELD ].append (alt )
439+ vrs_field_data [FieldName .LENGTHS_FIELD ].append (length )
440+ vrs_field_data [FieldName .REPEAT_SUBUNIT_LENGTHS_FIELD ].append (
441+ repeat_subunit_length
442+ )
386443
387444 def _get_vrs_data (
388445 self ,
@@ -432,7 +489,6 @@ def _get_vrs_data(
432489 # Get VRS data for alts
433490 alts = record .alts or []
434491 alleles = [f"{ gnomad_loc } -{ record .ref } -{ a } " for a in [* alts ]]
435- data = f"{ record .chrom } \t { record .pos } \t { record .ref } \t { record .alts } "
436492 for allele in alleles :
437493 if "*" in allele :
438494 _logger .debug ("Star allele found: %s" , allele )
@@ -444,7 +500,6 @@ def _get_vrs_data(
444500 allele_collection ,
445501 vrs_field_data ,
446502 assembly ,
447- vrs_data_key = data ,
448503 vrs_attributes = vrs_attributes ,
449504 require_validation = require_validation ,
450505 )
0 commit comments