# Version string exposed by this module.
__version__ = "1.5.1"
import logging
from enum import Enum
from typing import NamedTuple, Tuple
# Module-level logger registered under the name "iobes".
LOGGER = logging.getLogger("iobes")
class TokenFunction:
    """Tag prefixes that tell a decoder what role a token plays.

    A tag can usually be split into two pieces. The first piece is the token
    function, which tells the decoding parser how to act when it reaches the
    tag. The second piece is the type of the span (``PER``, ``LOC``, etc).
    The attributes below are the token-function strings used across the
    supported formats.
    """

    #: Not part of any span. Unusually, this is a complete tag rather than just a prefix.
    OUTSIDE = "O"

    #: Starts a span.
    BEGIN = "B"

    #: Sits in the middle of a span.
    INSIDE = "I"
    #: Sits in the middle of a span.
    MIDDLE = "M"

    #: Ends a span.
    END = "E"
    #: Ends a span.
    LAST = "L"

    #: Represents a whole span by itself.
    SINGLE = "S"
    #: Represents a whole span by itself.
    UNIT = "U"
    #: Represents a whole span by itself.
    WHOLE = "W"

    #: Special tag for the beginning of a sequence.
    GO = "<GO>"
    #: Special tag for the end of a sequence.
    EOS = "<EOS>"
class IOB(SpanFormat):
    """The original IOB tagging format.

    This is the first span encoding format, proposed in `Ramshaw and Marcus, 1995`_.

    It is the only format that is contextual. When two spans of the same type
    touch, the first token of the second span gets a ``B-`` prefix; when the
    first token of a span does not follow (touch) another span of the same
    type it gets an ``I-`` prefix instead. The value of the BEGIN tag is
    therefore not known without context. The same holds for the SINGLE tag: a
    single-token span uses ``I-`` if it is preceded by no span or by a span of
    a different type, and ``B-`` if the previous span has the same type.

    .. _Ramshaw and Marcus, 1995: https://www.aclweb.org/anthology/W95-0107/
    """

    #: The prefix that begins a span is unknown a priori; it depends on the previous span.
    BEGIN = None
    #: The prefix for the inside of a span is always known.
    INSIDE = TokenFunction.INSIDE
    #: The end prefix is always known: it is the same as the inside prefix.
    END = INSIDE
    #: Like BEGIN, the single-token prefix is unknown without the previous span type.
    SINGLE = None
class BIO(SpanFormat):
    """The improved BIO tagging format.

    An improvement on the IOB format: every entity starts with a ``B-`` token
    regardless of the previous span. Because the first token is always a
    ``B-``, the format is context independent. There is no special end tag;
    an entity is ended by things like an ``O`` or a token of a different type.
    """

    #: Every span opens with the begin prefix.
    BEGIN = TokenFunction.BEGIN
    #: Tokens after the first carry the inside prefix.
    INSIDE = TokenFunction.INSIDE
    #: No dedicated end prefix exists, so the last token uses the inside prefix.
    END = INSIDE
    #: A single-token span is just a begin token.
    SINGLE = BEGIN
class IOBES(SpanFormat):
    """The best tagging format.

    TODO: flesh out.

    This format adds an END tag that has to appear at the end of entities. It
    has been shown to be better than IOB or BIO (`Ratinov and Roth, 2009`_)
    and should be used instead.

    .. _Ratinov and Roth, 2009: https://www.aclweb.org/anthology/W09-1119/
    """

    #: Prefix of the first token of a multi-token span.
    BEGIN = TokenFunction.BEGIN
    #: Prefix of the tokens between the first and the last.
    INSIDE = TokenFunction.INSIDE
    #: Prefix of the final token of a multi-token span.
    END = TokenFunction.END
    #: Prefix of a span made of exactly one token.
    SINGLE = TokenFunction.SINGLE
class BILOU(SpanFormat):
    """The BILOU format.

    TODO: flesh out.

    Identical to the IOBES format except that the END and SINGLE tokens use
    different values.
    """

    #: Prefix of the first token of a multi-token span.
    BEGIN = TokenFunction.BEGIN
    #: Prefix of the tokens between the first and the last.
    INSIDE = TokenFunction.INSIDE
    #: The end of a span is marked with the ``L`` (last) prefix.
    END = TokenFunction.LAST
    #: A single-token span is marked with the ``U`` (unit) prefix.
    SINGLE = TokenFunction.UNIT
class BMEOW(SpanFormat):
    """The BMEOW format.

    TODO: flesh out.

    From `Borthwick, 1999`_.

    Identical to the IOBES format except that the INSIDE and SINGLE tokens
    use different values.

    .. _Borthwick, 1999: https://www.math.nyu.edu/media/mathfin/publications/borthwick_andrew.pdf
    """

    #: Prefix of the first token of a multi-token span.
    BEGIN = TokenFunction.BEGIN
    #: The inside of a span is marked with the ``M`` (middle) prefix.
    INSIDE = TokenFunction.MIDDLE
    #: Prefix of the final token of a multi-token span.
    END = TokenFunction.END
    #: A single-token span is marked with the ``W`` (whole) prefix.
    SINGLE = TokenFunction.WHOLE
#: This is the same as BMEOW and what a lot of people actually call it but having `meow` in it seems better lol.
BMEWO = BMEOW
class TOKEN(SpanFormat):
    """A format for working on individual tokens rather than spans.

    Here the tags describe the tokens themselves and are not meant to be
    merged into multi-token spans. The format ensures that every tag is
    turned into a span of length ``1``, which lets metrics be computed over
    individual tags without changing the processing code. It is used for
    things like part of speech tagging.

    Tokens have no special prefixes that dictate the function a token plays
    in a span, so this class defines no values of its own and all the class
    values are left as ``None``.
    """
class SpanEncoding(Enum):
    """The span encoding schemes that can be processed.

    Each member's value is the format class of the same name.
    """

    TOKEN = TOKEN
    IOB = IOB
    BIO = BIO
    IOBES = IOBES
    BILOU = BILOU
    BMEOW = BMEOW
    # BMEWO is bound to the same class as BMEOW, so Enum treats it as an alias of that member.
    BMEWO = BMEWO

    @classmethod
    def from_string(cls, value: str) -> "SpanEncoding":
        """Look up the span encoding named by a string.

        The name is lower-cased and stripped of surrounding whitespace before
        it is matched, and several schemes accept more than one spelling.

        Args:
            value: The name of the encoding scheme.

        Raises:
            ValueError: If the name does not match any known encoding scheme.

        Returns:
            The matching SpanEncoding member.
        """
        value = value.lower().strip()
        # Built inside the method: a dict in the class body would become an enum member.
        known_names = {
            "iob": cls.IOB,
            "iob2": cls.BIO,
            "bio": cls.BIO,
            "iobes": cls.IOBES,
            "bilou": cls.BILOU,
            "bioul": cls.BILOU,
            "bmewo": cls.BMEOW,
            "bmeow": cls.BMEOW,
            "token": cls.TOKEN,
        }
        encoding = known_names.get(value)
        if encoding is None:
            raise ValueError(f"Unknown Encoding scheme, got: `{value}`")
        return encoding
class Span(NamedTuple):
    """Our representation of a span of text.

    Note:
        The ``end`` attribute of a span is one greater than the index of the
        final token in the span. This is so that python list slicing works.
        For example, ``tokens[span.start : span.end]`` will yield the surface
        form of the span.

    Args:
        type: The type of the span in our downstream task, things like ``PER`` or ``LOC``.
        start: The index into the tokens list where the span starts.
        end: The index of the last token of the span plus 1.
        tokens: The indices that are part of the span.
    """

    type: str
    start: int
    end: int
    # Variable-length tuple of token indices; a bare ``Tuple[int]`` would declare exactly one element.
    tokens: Tuple[int, ...]
class ErrorType(Enum):
    """Enumeration of parsing error kinds; it currently defines no members."""
class Error(NamedTuple):
    """A problem found while parsing tags into spans.

    Args:
        location: The index at which the error occurred.
        type: The kind of error. TODO: these kinds still need to be
            enumerated and their specifics hammered out.
        current: The tag found at the index of the error.
        previous: The tag before it.
        next: The tag after it.
    """

    location: int
    type: str
    current: str
    previous: str
    next: str

    def __str__(self) -> str:
        """Return a one-line summary giving the error kind and its index."""
        return f"{self.type} error at index {self.location}."
from iobes.convert import (
iob_to_bio,
iob_to_iobes,
iob_to_bilou,
iob_to_bmeow,
iob_to_bmewo,
bio_to_iob,
bio_to_iobes,
bio_to_bilou,
bio_to_bmeow,
bio_to_bmewo,
iobes_to_iob,
iobes_to_bio,
iobes_to_bilou,
iobes_to_bmeow,
iobes_to_bmewo,
bilou_to_iob,
bilou_to_bio,
bilou_to_iobes,
bilou_to_bmeow,
bilou_to_bmewo,
bmeow_to_iob,
bmeow_to_bio,
bmeow_to_iobes,
bmeow_to_bilou,
bmewo_to_iob,
bmewo_to_bio,
bmewo_to_iobes,
bmewo_to_bilou,
)
from iobes.parse import (
parse_spans,
parse_spans_token,
parse_spans_iob,
parse_spans_bio,
parse_spans_iobes,
parse_spans_bilou,
parse_spans_bmeow,
parse_spans_bmewo,
parse_spans_with_errors,
parse_spans_token_with_errors,
parse_spans_iob_with_errors,
parse_spans_bio_with_errors,
parse_spans_iobes_with_errors,
parse_spans_bilou_with_errors,
parse_spans_bmeow_with_errors,
parse_spans_bmewo_with_errors,
validate_tags,
validate_tags_iob,
validate_tags_bio,
validate_tags_iobes,
validate_tags_bilou,
validate_tags_bmeow,
validate_tags_bmewo,
)
from iobes.transition import (
Transition,
transitions_legality,
iob_transitions_legality,
bio_transitions_legality,
iobes_transitions_legality,
bilou_transitions_legality,
bmeow_transitions_legality,
bmewo_transitions_legality,
transitions_to_tuple_map,
transitions_to_map,
)
from iobes.write import (
write_tags,
write_iob_tags,
write_bio_tags,
write_iobes_tags,
write_bilou_tags,
write_bmeow_tags,
write_bmewo_tags,
)
from iobes.utils import (
extract_type,
extract_function,
)