Source code for pygfa.graph_element.parser.field_validator

"""
Field validation module that checks each field string against the
GFA1 and GFA2 specifications.
"""
import re

class InvalidFieldError(Exception):
    """Raised when a field string is not valid for its datatype."""
class UnknownDataTypeError(Exception):
    """Raised when the requested datatype is not a key of the
    `DATASTRING_VALIDATION_REGEXP` dictionary.
    """
class FormatError(Exception):
    """Raised when the validator is handed an object of the wrong type
    (anything that is not a Python string).
    """
# Datatype codes for optional-field values (shared by GFA1 and GFA2).
TYPE_A = 'A'
TYPE_i = 'i'
TYPE_f = 'f'
TYPE_Z = 'Z'
JSON = 'J'
HEX_BYTE_ARRAY = 'H'
DEC_ARRAY = 'B'

# Datatype codes for GFA1 positional fields.
GFA1_NAME = 'lbl'
GFA1_NAMES = 'lbs'
GFA1_ORIENTATION = 'orn'
GFA1_SEQUENCE = 'seq'
GFA1_CIGAR = 'cig'
GFA1_CIGARS = 'cgs'
GFA1_INT = 'pos'

# Datatype codes for GFA2 positional fields.
GFA2_ID = 'id'
GFA2_IDS = 'ids'
GFA2_REFERENCE = 'ref'
GFA2_REFERENCES = 'rfs'
GFA2_INT = 'int'
GFA2_TRACE = 'trc'
GFA2_ALIGNMENT = 'aln'
GFA2_POSITION = 'pos2'
GFA2_CIGAR = 'cig2'
GFA2_SEQUENCE = 'seq2'
GFA2_OPTIONAL_INT = 'oint'
GFA2_OPTIONAL_ID = 'oid'

# These are the types of value a field can assume.
# These are the same as the ones in rgfa, I've extended
# the list to support GFA2.
#
# GFA2: 'id', 'ids', 'ref', 'rfs', 'cig2', 'oid'(opt_id),
#       'trc', 'aln', 'pos2', 'seq2', 'int', 'oint'
#
# Every pattern is a raw string so that regex escapes such as ``\.`` and
# ``\*`` are not parsed as (invalid) Python string escapes. Every pattern
# is anchored at both ends so it stays correct even when it is used with
# ``re.match``/``re.search`` instead of ``re.fullmatch``.
DATASTRING_VALIDATION_REGEXP = {
    # any single printable character
    TYPE_A: r"^[!-~]$",
    # signed integer
    TYPE_i: r"^[-+]?[0-9]+$",
    # single-precision floating number
    TYPE_f: r"^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?$",
    # printable string, including space
    TYPE_Z: r"^[ !-~]+$",
    # JSON, excluding new-line and tab characters
    JSON: r"^[ !-~]+$",
    # byte array in the hex format
    HEX_BYTE_ARRAY: r"^[0-9A-F]+$",
    # integer or numeric array
    DEC_ARRAY: r"^[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+$",
    # segment/path label (segment name)
    GFA1_NAME: r"^[!-)+-<>-~][!-~]*$",
    # segment orientation; a character class is used because the former
    # "^\+|-$" alternation bound the anchors to one branch each
    GFA1_ORIENTATION: r"^[+-]$",
    # A single label with orientation.
    # Changed according to issue 59: since the comma is accepted by
    # [!-~], it is not possible to write a clear regexp for a
    # comma-separated array of labels, so labels are meant to be split
    # and checked one by one against this pattern. The former
    # array pattern was:
    # "^[!-)+-<>-~][!-~]*[+-](,[!-)+-<>-~][!-~]*[+-])+$"
    GFA1_NAMES: r"^[!-)+-<>-~][!-~]*[+-]$",
    # nucleotide sequence (segment sequence)
    GFA1_SEQUENCE: r"^\*$|^[A-Za-z=.]+$",
    # positive integer; NOTE: this pattern also accepts the empty string
    # (CLAIM ISSUE HERE, MOVE TO -> int)
    GFA1_INT: r"^[0-9]*$",
    # CIGAR string
    GFA1_CIGAR: r"^(\*|(([0-9]+[MIDNSHPX=])+))$",
    # multiple CIGARs, comma-separated
    GFA1_CIGARS:
        r"^(\*|(([0-9]+[MIDNSHPX=])+))(,(\*|(([0-9]+[MIDNSHPX=])+)))*$",
    # 'cmt': ".*",  # content of comment line, everything is allowed
    # it's the lbl for GFA2
    GFA2_ID: r"^[!-~]+$",
    # it's the lbs for GFA2 (space-separated)
    GFA2_IDS: r"^[!-~]+([ ][!-~]+)*$",
    # an id followed by its orientation
    GFA2_REFERENCE: r"^[!-~]+[+-]$",
    # array of references (space-separated)
    GFA2_REFERENCES: r"^[!-~]+[+-]([ ][!-~]+[+-])*$",
    # GFA1 has pos to describe any positive integer,
    # but pos accepts the empty string, while 'int' doesn't
    GFA2_INT: r"^[0-9]+$",
    # comma-separated list of integers
    GFA2_TRACE: r"^[0-9]+(,[0-9]+)*$",
    # either '*', a trace or a GFA2 CIGAR
    GFA2_ALIGNMENT: r"^\*$|^[0-9]+(,[0-9]+)*$|^([0-9]+[MDIP])+$",
    # pos2 represents a position in GFA2, it's similar in NO WAY
    # to pos which represents a positive integer in GFA1
    GFA2_POSITION: r"^[0-9]+\$?$",
    # CIGAR string for GFA2
    GFA2_CIGAR: r"^([0-9]+[MDIP])+$",
    # seq2 is a GFA2 sequence, it's more flexible than GFA1 seq
    GFA2_SEQUENCE: r"^\*$|^[!-~]+$",
    # optional id for GFA2
    GFA2_OPTIONAL_ID: r"^\*$|^[!-~]+$",
    # optional int
    GFA2_OPTIONAL_INT: r"^\*$|^[0-9]+$",
}
def is_valid(string, datatype):
    """Tell whether ``string`` conforms to the given ``datatype``.

    :param string: The field value to check.
    :param datatype: The type of data corresponding to the string; it
        must be a key of `DATASTRING_VALIDATION_REGEXP`.
    :returns: True if the whole string matches the pattern registered
        for the datatype, False otherwise.
    :raises FormatError: If ``string`` is not a Python string.
    :raises UnknownDataTypeError: If the datatype is not present in
        `DATASTRING_VALIDATION_REGEXP`.
    """
    # The string type is checked before the datatype, so a non-string
    # value is always reported as a FormatError.
    if not isinstance(string, str):
        raise FormatError(
            "A string must be given to validate it, given:{0}".format(string))

    if datatype not in DATASTRING_VALIDATION_REGEXP:
        raise UnknownDataTypeError(
            "Invalid field datatype,given: {0}".format(datatype))

    pattern = DATASTRING_VALIDATION_REGEXP[datatype]
    return re.fullmatch(pattern, string) is not None
def is_dazzler_trace(string):
    """Check if the given string is valid for the `GFA2_TRACE` datatype.

    :returns: The result of `is_valid` for the `GFA2_TRACE` datatype.
    """
    trace_is_valid = is_valid(string, GFA2_TRACE)
    return trace_is_valid
def is_gfa1_cigar(string):
    """Check if the given string is a valid CIGAR string
    as defined in the GFA1 specification.

    The placeholder ``*`` is rejected here although the `GFA1_CIGAR`
    pattern itself accepts it.
    """
    if string == "*":
        return False
    return is_valid(string, GFA1_CIGAR)
def is_gfa2_cigar(string):
    """Check if the given string is a valid CIGAR string
    as defined in the GFA2 specification.

    The placeholder ``*`` is rejected up front, before the string is
    checked against the `GFA2_CIGAR` datatype.
    """
    if string == "*":
        return False
    return is_valid(string, GFA2_CIGAR)
def validate(string, datatype):
    """Validate ``string`` against ``datatype`` and return it converted
    to the Python type closest to the one it represents.

    :param string: The field value to validate.
    :param datatype: The type of data corresponding to the string; it
        must be a key of `DATASTRING_VALIDATION_REGEXP`.
    :returns: An ``int`` for 'i', 'pos', 'int' and non-``*`` 'oint'
        values, a ``float`` for 'f', a list of strings for 'cgs', 'ids'
        and 'rfs', and the unchanged string for every other datatype.
    :raises InvalidFieldError: If the string does not respect the
        datatype, or if it is an empty 'pos' value that cannot be
        converted to an integer.

    Exceptions raised by `is_valid` are propagated unchanged.
    """
    if not is_valid(string, datatype):
        raise InvalidFieldError(
            "The string cannot be validated within its datatype,\n"
            "given string : {0}\ndatatype: {1}.".format(string, datatype))

    if datatype == TYPE_i:
        return int(string)

    if datatype in (GFA1_INT, GFA2_INT):
        # The 'pos' pattern accepts the empty string, which int() cannot
        # convert; report it through the module's own exception instead
        # of leaking a bare ValueError to the caller.
        if not string:
            raise InvalidFieldError(
                "An empty string cannot be converted to an integer,\n"
                "given string : {0}\ndatatype: {1}.".format(string, datatype))
        # The pattern only allows digits, so the value is always >= 0.
        return int(string)

    if datatype == GFA2_OPTIONAL_INT:
        # '*' marks an absent value and is returned untouched.
        return string if string == "*" else int(string)

    if datatype == TYPE_f:
        return float(string)

    if datatype == GFA1_CIGARS:
        return string.split(",")

    if datatype == GFA2_ALIGNMENT:
        # The string is either '*', a GFA2 CIGAR or a trace.
        if string == "*":
            return string
        if is_valid(string, GFA2_CIGAR):
            return validate(string, GFA2_CIGAR)
        return validate(string, GFA2_TRACE)

    if datatype in (GFA2_IDS, GFA2_REFERENCES):
        return string.split()

    # 'J' (TODO: ask if the json must be manipulated), 'orn', 'A', 'Z',
    # 'seq', 'lbl', 'cig', 'cig2', 'H', 'B', 'trc', 'id', 'ref', 'pos2',
    # 'seq2', 'oid', 'lbs'
    return string
# Nothing is executed when the module is run as a script.
if __name__ == '__main__': # pragma: no cover
    pass