Source code for modules.xrenner_marker

#!/usr/bin/python
# -*- coding: utf-8 -*-

import re
from collections import defaultdict, OrderedDict
from .xrenner_classes import Markable
from six import iteritems, iterkeys

"""
Marker module for markable entity recognition. Establishes compatibility between entity features
and determines markable extension in tokens

Author: Amir Zeldes
"""


def is_atomic(mark, atoms, lex):
    """
    Checks if nested markables are allowed within this markable

    :param mark: the :class:`.Markable` to be checked for atomicity
    :param atoms: list of atomic markable text strings
    :param lex: the :class:`.LexData` object with gazetteer information and model settings
    :return: bool
    """

    marktext = mark.text.strip()

    # Do not accept a markable [New] within atomic [New Zealand]
    if marktext in atoms:
        return True
    elif marktext.lower() in atoms:
        return True
    # Remove possible prefix tokens to reject [The [United] Kingdom] if [United Kingdom] is in atoms
    elif remove_prefix_tokens(marktext, lex).strip() in atoms:
        return True
    # Remove possible suffix tokens to reject [[New] Zealand 's] if [New Zealand] is in atoms
    elif remove_suffix_tokens(marktext, lex).strip() in atoms:
        return True
    elif remove_infix_tokens(marktext, lex).strip() in atoms:
        return True
    # Combination of prefix and suffix to reject [The [United] Kingdom 's]
    elif mark.core_text in atoms:
        return True
    elif replace_head_with_lemma(mark) in atoms:
        return True
    # Dynamic generation of proper name pattern
    elif 0 < marktext.strip().count(" ") < 3 and marktext.strip().split(" ")[0] in lex.first_names and marktext.strip().split(" ")[-1] in lex.last_names:
        return True
    else:
        non_essential_modifiers = list(mod.text for mod in mark.head.modifiers if lex.filters["non_essential_mod_func"].match(mod.func))
        if len(non_essential_modifiers) > 0:
            mark_unmod_text = mark.core_text
            for mod in non_essential_modifiers:
                mark_unmod_text = mark_unmod_text.replace(mod + " ", "")
            if mark_unmod_text in lex.atoms:
                return True

    # Not an atom, nested markables allowed
    return False
def remove_suffix_tokens(marktext, lex):
    """
    Remove trailing tokens such as genitive 's and other tokens configured as potentially redundant to citation form

    :param marktext: the markable text string to remove tokens from
    :param lex: the :class:`.LexData` object with gazetteer information and model settings
    :return: potentially truncated text
    """

    if lex.filters["core_suffixes"].search(marktext):
        return lex.filters["core_suffixes"].sub(" ", marktext)
    else:
        tokens = marktext.split(" ")
        suffix_candidate = ""
        for token in reversed(tokens):
            suffix_candidate = token + " " + suffix_candidate
            if suffix_candidate.strip() in lex.affix_tokens:
                # NB: check for a "suffix" entry here, since we are stripping at the end of the markable
                if lex.affix_tokens[suffix_candidate.strip()] == "suffix":
                    return re.sub(suffix_candidate + r'$', "", marktext)
    return marktext
def remove_prefix_tokens(marktext, lex):
    """
    Remove leading tokens such as articles and other tokens configured as potentially redundant to citation form

    :param marktext: the markable text string to remove tokens from
    :param lex: the :class:`.LexData` object with gazetteer information and model settings
    :return: potentially truncated text
    """

    if lex.filters["core_prefixes"].match(marktext):  # NB use initial match here
        return lex.filters["core_prefixes"].sub(" ", marktext)
    else:
        tokens = marktext.split(" ")
        prefix_candidate = ""
        for token in tokens:
            prefix_candidate += token + " "
            if prefix_candidate.strip() in lex.affix_tokens:
                if lex.affix_tokens[prefix_candidate.strip()] == "prefix":
                    return re.sub(r'^' + prefix_candidate, "", marktext)
    return marktext
def remove_infix_tokens(marktext, lex):
    """
    Remove infix tokens such as dashes, interfixed articles (in Semitic construct state) etc.

    :param marktext: the markable text string to remove tokens from
    :param lex: the :class:`.LexData` object with gazetteer information and model settings
    :return: potentially truncated text
    """

    return lex.filters["core_infixes"].sub(" ", marktext)
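
# Illustrative sketch (hypothetical stub data, not from xrenner): how the three
# affix-removal helpers above reduce a markable to its citation form. The regexes
# and affix_tokens entries below are invented for demonstration only.
def _demo_affix_removal():
    class _StubLex(object):
        filters = {
            "core_prefixes": re.compile(r"^(a|an) "),
            "core_suffixes": re.compile(r" 's$"),
            "core_infixes": re.compile(r" (-|--) "),
        }
        affix_tokens = {"the": "prefix"}

    lex = _StubLex()
    assert remove_prefix_tokens("the White House", lex) == "White House"
    assert remove_suffix_tokens("New Zealand 's", lex).strip() == "New Zealand"
    assert remove_infix_tokens("Baden - Baden", lex) == "Baden Baden"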
def resolve_mark_entity(mark, lex):
    """
    Main function to set entity type based on progressively less restricted parts of a markable's text

    :param mark: The :class:`.Markable` object to get the entity type for
    :param lex: the :class:`.LexData` object with gazetteer information and model settings
    :return: void
    """

    entity = ""
    use_entity_deps = True
    use_entity_sims = True
    use_sequencer = True if lex.sequencer is not None else False
    if "ablations" in lex.debug:
        if "no_entity_dep" in lex.debug["ablations"]:
            use_entity_deps = False
        if "no_entity_sim" in lex.debug["ablations"]:
            use_entity_sims = False
        if "no_sequencer" in lex.debug["ablations"]:
            use_sequencer = False

    ## DEBUG POINT ##
    if mark.text == lex.debug["ana"] or mark.head.text == lex.debug["ana"]:
        pass  # Set a breakpoint here to inspect the markable under analysis

    parent_text = mark.head.head_text
    if mark.form == "pronoun":
        if re.search(r'[12]', mark.agree):  # Explicit 1st or 2nd person pronoun
            entity = lex.filters["person_def_entity"]
            mark.entity_certainty = 'certain'
        elif mark.agree == "male" or mark.agree == "female":  # Possibly human 3rd person
            entity = lex.filters["person_def_entity"]
            mark.entity_certainty = 'uncertain'
        else:
            if use_sequencer:
                pred, score = mark.head.seq_pred
                if pred != "O":
                    entity = pred
                    mark.entity_certainty = 'sequencer'
            if use_entity_deps and entity == "":
                if parent_text in lex.entity_deps:
                    if mark.head.func in lex.entity_deps[parent_text]:
                        dep_ents = dict(lex.entity_deps[parent_text][mark.head.func])
                        if lex.filters["no_person_agree"].match(mark.agree) is not None and lex.filters["person_def_entity"] in dep_ents:
                            del dep_ents[lex.filters["person_def_entity"]]
                        if len(dep_ents) > 0:
                            entity = max(iterkeys(dep_ents), key=(lambda key: dep_ents[key]))
                if entity == "":  # No literal match for dependency, fall back to similar heads
                    if parent_text in lex.similar and use_entity_sims:
                        similar_heads = lex.similar[parent_text]
                        for similar_head in similar_heads:
                            if similar_head in lex.entity_deps:
                                if mark.head.func in lex.entity_deps[similar_head]:
                                    if lex.filters["no_person_agree"].match(mark.agree) is not None:
                                        similar_dict = {}
                                        for key, value in lex.entity_deps[similar_head][mark.head.func].items():
                                            if key != lex.filters["person_def_entity"]:
                                                similar_dict[key] = value
                                    else:
                                        similar_dict = lex.entity_deps[similar_head][mark.head.func]
                                    if len(similar_dict) > 0:
                                        entity = max(similar_dict, key=(lambda key: similar_dict[key]))
                                        break
            if entity == "":  # Entity dependency information not used; no way to guess entity
                entity = lex.filters["default_entity"]
                mark.entity_certainty = "uncertain"
    else:
        if mark.coordinate:
            # For coordinate markables we expect the constituents to determine the entity in assign_coordinate_entity.
            # An exception to this is when the entire coordination is listed in the entities list.
            if entity == "":
                entity = resolve_entity_cascade(mark.text, mark, lex)
            if entity == "":
                entity = resolve_entity_cascade(mark.core_text, mark, lex)
        else:
            if entity == "":  # Try to catch year numbers and hours + minutes
                if re.match(r'^(1[456789][0-9][0-9]|20[0-9][0-9]|(2[0-3]|1?[0-9]):[0-5][0-9]|ה?תש.".)$', mark.head.text) is not None:
                    entity = lex.filters["time_def_entity"]
                    mark.entity_certainty = "uncertain"
                    mark.subclass = "time-unit"  # TODO: de-hardwire this
                    mark.definiteness = "def"  # literal year numbers are considered definite like 'proper names'
                    mark.form = "proper"  # literal year numbers are considered definite like 'proper names'
            if entity == "":
                if re.match(r'^(([0-9]+[.,]?)+)$', mark.core_text) is not None:
                    entity = lex.filters["quantity_def_entity"]
                    mark.alt_entities.append(lex.filters["time_def_entity"])
                    mark.entity_certainty = "uncertain"
            if entity == "":
                entity = resolve_entity_cascade(mark.text, mark, lex)
            if entity == "":
                entity = resolve_entity_cascade(replace_head_with_lemma(mark), mark, lex)
            if entity == "":
                entity = resolve_entity_cascade(remove_suffix_tokens(mark.text.strip(), lex), mark, lex)
            if entity == "":
                entity = resolve_entity_cascade(remove_prefix_tokens(mark.text.strip(), lex), mark, lex)
            if entity == "" and mark.core_text != mark.text:
                entity = resolve_entity_cascade(mark.core_text, mark, lex)
            if entity == "":
                entity = recognize_entity_by_mod(mark, lex)
            if entity == "" and mark.head.text.istitle():
                if mark.head.text in lex.last_names:
                    modifiers_match_article = (lex.filters["articles"].match(mod.text) is not None for mod in mark.head.modifiers)
                    modifiers_match_first_name = (mod.text in lex.first_names for mod in mark.head.modifiers)
                    if any(modifiers_match_first_name) and not any(modifiers_match_article):
                        entity = lex.filters["person_def_entity"]
            if entity == "" and mark.head.text.istitle():
                entity = resolve_entity_cascade(mark.core_text.lower(), mark, lex)
            if entity == "" and not mark.head.text.istitle():
                entity = resolve_entity_cascade(mark.core_text[:1].upper() + mark.core_text[1:], mark, lex)
            if entity == "":
                entity = resolve_entity_cascade(mark.head.text, mark, lex)
            if entity == "" and mark.head.text.istitle():
                entity = resolve_entity_cascade(mark.head.text.lower(), mark, lex)
            if entity == "" and mark.head.text.isupper():
                entity = resolve_entity_cascade(mark.head.text.lower(), mark, lex)
            if entity == "" and mark.head.text.isupper():
                entity = resolve_entity_cascade(mark.head.text.lower().title(), mark, lex)
            if entity == "" and not mark.head.lemma == mark.head.text:  # Try lemma match if lemma different from token
                entity = resolve_entity_cascade(mark.head.lemma, mark, lex)
            if entity == "":
                if mark.head.text.istitle() or not lex.filters["cap_names"]:
                    if mark.head.text in lex.last_names or mark.head.text in lex.first_names:
                        modifiers_match_definite = (lex.filters["definite_articles"].match(mod.text) is not None for mod in mark.head.modifiers)
                        modifiers_match_article = (lex.filters["articles"].match(mod.text) is not None for mod in mark.head.modifiers)
                        modifiers_match_def_entity = (re.sub(r"\t.*", "", lex.entity_heads[mod.text.strip().lower()][0]) == lex.filters["default_entity"] for mod in mark.head.modifiers if mod.text.strip().lower() in lex.entity_heads)
                        if not (any(modifiers_match_article) or any(modifiers_match_definite) or any(modifiers_match_def_entity)):
                            entity = lex.filters["person_def_entity"]
            if entity == "":  # Just use sequencer if desired
                if use_sequencer:
                    pred, score = mark.head.seq_pred
                    if pred != "O":
                        entity = pred
                        mark.entity_certainty = 'sequencer'
            if entity == "":  # See what the affix morphology predicts for the head
                head_text = mark.lemma if mark.lemma != "_" and mark.lemma != "" else mark.head.text
                morph_probs = get_entity_by_affix(head_text, lex)

                # Now check what the dependencies predict
                dep_probs = {}
                if use_entity_deps:
                    if parent_text in lex.entity_deps:
                        if mark.head.func in lex.entity_deps[parent_text]:
                            dep_probs.update(lex.entity_deps[parent_text][mark.head.func])
                    if len(dep_probs) == 0:  # No literal dependency information found, check if similar heads are known
                        if parent_text in lex.similar:
                            similar_heads = lex.similar[parent_text]
                            for similar_head in similar_heads:
                                if similar_head in lex.entity_deps:
                                    if mark.head.func in lex.entity_deps[similar_head]:
                                        dep_probs.update(lex.entity_deps[similar_head][mark.head.func])
                                        break

                # And check what entity similar words are
                sim_probs = {}
                if use_entity_sims:
                    if mark.head.text in lex.similar:
                        for similar_word in lex.similar[mark.head.text]:
                            if similar_word in lex.entity_heads:
                                for entity_type in lex.entity_heads[similar_word]:
                                    entity_string = entity_type.split("\t")[0]
                                    if entity_string in sim_probs:
                                        sim_probs[entity_string] += 1
                                    else:
                                        sim_probs.update({entity_string: 1})

                # Compare scores to decide between affix vs. dependency evidence vs. embeddings
                dep_values = list(dep_probs[key] for key in dep_probs)
                total_deps = float(sum(dep_values))
                sim_values = list(sim_probs[key] for key in sim_probs)
                total_sims = float(sum(sim_values))
                norm_dep_probs = {}
                norm_sim_probs = {}

                # Normalize - each information source hedges its bets based on how many guesses it makes
                for key, value in iteritems(dep_probs):
                    norm_dep_probs[key] = value / total_deps
                for key, value in iteritems(sim_probs):
                    norm_sim_probs[key] = value / total_sims

                joint_probs = defaultdict(float)
                joint_probs.update(norm_dep_probs)
                for entity in morph_probs:
                    joint_probs[entity] += morph_probs[entity]
                for entity in norm_sim_probs:
                    joint_probs[entity] += norm_sim_probs[entity]  # NB: add the normalized similarity score

                # Bias in favor of default entity to break ties
                joint_probs[lex.filters["default_entity"]] += 0.0000001

                entity = max(joint_probs, key=(lambda key: joint_probs[key]))

    if entity != "":
        mark.entity = entity
        if "/" in mark.entity:  # Lexicalized agreement information appended to entity
            if mark.agree == "" or mark.agree is None:
                mark.agree = mark.entity.split("/")[1]
            elif mark.agree_certainty == "":
                mark.alt_agree.append(mark.agree)
                mark.agree = mark.entity.split("/")[1]
            mark.entity = mark.entity.split("/")[0]
        elif mark.entity == lex.filters["person_def_entity"] and mark.agree == lex.filters["default_agree"] and mark.form != "pronoun":
            mark.agree = lex.filters["person_def_agree"]
            mark.agree_certainty = "uncertain"
        if "\t" in mark.entity:  # This is a subclass bearing solution
            mark.subclass = mark.entity.split("\t")[1]
            mark.entity = mark.entity.split("\t")[0]
        if mark.entity == lex.filters["person_def_entity"] and mark.form != "pronoun":
            if mark.text in lex.names:
                mark.agree = lex.names[mark.text]
        if mark.entity == lex.filters["person_def_entity"] and mark.agree is None:
            no_affix_mark = remove_suffix_tokens(remove_prefix_tokens(mark.text, lex), lex)
            if no_affix_mark in lex.names:
                mark.agree = lex.names[no_affix_mark]
        if mark.entity == lex.filters["person_def_entity"] and mark.agree is None:
            mark.agree = lex.filters["person_def_agree"]
            mark.agree_certainty = "uncertain"

    if mark.entity == "" and mark.core_text.upper() == mark.core_text and re.search(r"[A-ZÄÖÜ]", mark.core_text) is not None:
        # Unknown all caps entity, guess acronym default
        mark.entity = lex.filters["all_caps_entity"]
        mark.entity_certainty = "uncertain"
    if mark.entity == "":  # Unknown entity, guess default
        mark.entity = lex.filters["default_entity"]
        mark.entity_certainty = "uncertain"
    if mark.subclass == "":
        mark.subclass = mark.entity
    if mark.func == "title":
        mark.entity = lex.filters["default_entity"]
    if mark.agree == "" and mark.entity == lex.filters["default_entity"]:
        mark.agree = lex.filters["default_agree"]
def resolve_entity_cascade(entity_text, mark, lex):
    """
    Retrieve possible entity types for a given text fragment based on entities list, entity heads and names list

    :param entity_text: the text to determine the entity for
    :param mark: the :class:`.Markable` hosting the text fragment, used to retrieve context information (e.g. dependency)
    :param lex: the :class:`.LexData` object with gazetteer information and model settings
    :return: entity type; note that this is used to decide whether to stop the search, but the Markable's entity is
             already set during processing, together with matching subclass and agree information
    """

    options = {}
    entity = ""
    person_entity = lex.filters["person_def_entity"]
    if entity_text in lex.entities:
        for alt in lex.entities[entity_text]:
            parsed_entity = parse_entity(alt, "entities_match")
            if parsed_entity[0] not in mark.alt_entities:
                mark.alt_entities.append(parsed_entity[0])
                mark.alt_subclasses.append(parsed_entity[1])
                options[parsed_entity[0]] = parsed_entity
    if entity_text in lex.entity_heads:
        for alt in lex.entity_heads[entity_text]:
            parsed_entity = parse_entity(alt, "entity_heads_match")
            if parsed_entity[0] not in mark.alt_entities:
                mark.alt_entities.append(parsed_entity[0])
                mark.alt_subclasses.append(parsed_entity[1])
                options[parsed_entity[0]] = parsed_entity
    # Add the person entity based on a possible name despite seeing alternative entities,
    # if and only if this is supported by a unique dependency cue (only person dependencies found, well attested)
    if entity_text in lex.names or entity_text in lex.last_names or entity_text in lex.first_names:
        if (entity_text[0].istitle() or not lex.filters["cap_names"]) and person_entity not in mark.alt_entities:
            if mark.head.head_text in lex.entity_deps:
                if mark.func in lex.entity_deps[mark.head.head_text]:
                    if lex.filters["person_def_entity"] in lex.entity_deps[mark.head.head_text][mark.func]:
                        # Must be attested > 5 times; relaxing this can lower precision substantially
                        if lex.entity_deps[mark.head.head_text][mark.func][lex.filters["person_def_entity"]] > 5 and len(lex.entity_deps[mark.head.head_text][mark.func]) == 1:
                            mark.alt_entities.append(lex.filters["person_def_entity"])
                            mark.alt_subclasses.append(lex.filters["person_def_entity"])
                            name_agree = ""
                            if entity_text in lex.names:
                                name_agree = lex.names[entity_text]
                            elif entity_text in lex.first_names and not entity_text in lex.last_names:
                                name_agree = lex.first_names[entity_text]
                            options[person_entity] = (person_entity, person_entity, name_agree, "names_match")
    if len(mark.alt_entities) < 1 and 0 < entity_text.count(" ") < 3 and lex.filters["person_def_entity"] not in mark.alt_entities:
        if entity_text.split(" ")[0] in lex.first_names and entity_text.split(" ")[-1] in lex.last_names:
            if entity_text[0].istitle() or not lex.filters["cap_names"]:
                if lex.filters["articles"].match(mark.text.split(" ")[0]) is None:
                    mark.alt_entities.append(person_entity)
                    mark.alt_subclasses.append(person_entity)
                    options[person_entity] = (person_entity, person_entity, lex.first_names[entity_text.split(" ")[0]], "name_match")
    if person_entity not in mark.alt_entities and (mark.text in lex.first_names or mark.text in lex.last_names):
        mark.alt_entities.append(person_entity)
        options[person_entity] = (person_entity, person_entity, '', 'name_match')
    if len(mark.alt_entities) > 1:
        entity = disambiguate_entity(mark, lex)
    elif len(mark.alt_entities) == 1:
        entity = mark.alt_entities[0]
    if entity != "":
        mark.entity, mark.subclass = options[entity][0:2]
        if options[entity][2] != "":
            mark.agree = options[entity][2]
        mark.entity_certainty = options[entity][3]

    return entity if len(options) > 0 else ""
def parse_entity(entity_text, certainty="uncertain"):
    """
    Parses a gazetteer entry of the form entity -tab- subclass(/agree) -tab- frequency into a tuple

    :param entity_text: the string to parse, must contain exactly two tabs
    :param certainty: the certainty string placed at the end of the tuple, default 'uncertain'
    :return: quadruple of (entity, subclass, agree, certainty)
    """

    entity, subclass, freq = entity_text.split("\t")
    if "/" in subclass:
        subclass, agree = subclass.split("/")
    else:
        agree = ""
    return (entity, subclass, agree, certainty)
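
# Illustrative sketch (hypothetical gazetteer entries, not from xrenner's data):
# parse_entity splits entity TAB subclass(/agree) TAB frequency as follows.
def _demo_parse_entity():
    assert parse_entity("person\tpolitician/male\t23", "entities_match") == \
        ("person", "politician", "male", "entities_match")
    # Without an agreement suffix, agree comes back empty and certainty defaults to 'uncertain'
    assert parse_entity("place\tcity\t7") == ("place", "city", "", "uncertain")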
def resolve_mark_agree(mark, lex):
    """
    Resolve Markable agreement based on morph information in tokens or gazetteer data

    :param mark: The :class:`.Markable` to resolve agreement for
    :param lex: the :class:`.LexData` object with gazetteer information and model settings
    :return: void
    """

    # DEBUG POINT #
    if mark.text == lex.debug["ana"]:
        pass  # Set a breakpoint here to inspect the markable under analysis

    if mark.head.morph not in ["", "_"]:
        mark.agree_certainty = "head_morph"
        return [mark.head.morph]
    else:
        if mark.form == "pronoun":
            if mark.text in lex.pronouns:
                return lex.pronouns[mark.text]
            elif mark.text.lower() in lex.pronouns:
                return lex.pronouns[mark.text.lower()]
        if mark.form == "proper":
            if mark.core_text in lex.names:
                return [lex.names[mark.core_text]]
            elif mark.core_text in lex.first_names and mark.core_text not in lex.entities and mark.core_text not in lex.entity_heads:
                # Single name component core text
                return [lex.first_names[mark.core_text]]
        if mark.head.pos in lex.pos_agree_mappings:
            mark.agree_certainty = "pos_agree_mappings"
            return [lex.pos_agree_mappings[mark.head.pos]]
        elif mark.core_text in lex.entities:
            for full_entry in lex.entities[mark.core_text]:
                entry = full_entry.split("\t")[1]
                if "/" in entry:
                    if mark.agree == "":
                        mark.agree = entry[entry.find("/") + 1:]
                    mark.alt_agree.append(entry[entry.find("/") + 1:])
        elif mark.head.text in lex.entity_heads:
            for full_entry in lex.entity_heads[mark.head.text]:
                entry = full_entry.split("\t")[1]
                if "/" in entry:
                    if mark.agree == "":
                        mark.agree = entry[entry.find("/") + 1:]
                    mark.alt_agree.append(entry[entry.find("/") + 1:])
def resolve_cardinality(mark, lex):
    """
    Find cardinality for Markable based on numerical modifiers or number words

    :param mark: The :class:`.Markable` to resolve cardinality for
    :param lex: the :class:`.LexData` object with gazetteer information and model settings
    :return: cardinality as float, zero if unknown
    """

    def check_card(mod):
        if mod in lex.numbers:
            return int(lex.numbers[mod][0])
        elif mod.lower() in lex.numbers:
            return int(lex.numbers[mod.lower()][0])
        else:
            thousand_sep = r"\." if lex.filters["thousand_sep"] == "." else lex.filters["thousand_sep"]
            pure_number_candidate = re.sub(thousand_sep, "", mod)
            decimal_sep = lex.filters["decimal_sep"]
            if decimal_sep != ".":
                pure_number_candidate = re.sub(decimal_sep, ".", pure_number_candidate)
            if re.match(r"^(\d+(\.\d+)?|(\.\d+))$", pure_number_candidate) is not None:
                return float(pure_number_candidate)
            else:
                parts = re.match(r"^(\d+)/(\d+)$", pure_number_candidate)
                if parts is not None:  # Fraction with slash division
                    numerator = float(parts.groups()[0])
                    denominator = float(parts.groups()[1])
                    return numerator / denominator

    for m in mark.head.modifiers:
        card = check_card(m.text)
        if card is not None:
            return card

    card = check_card(mark.head.text)
    if card is not None:
        return card
    card = check_card(mark.head.lemma)
    if card is not None:
        return card

    return 0
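
# Illustrative sketch (hypothetical stub objects, not from xrenner): cardinality
# resolution from a numerical modifier, with separator normalization, and from a
# number word listed in the (invented) numbers gazetteer.
def _demo_resolve_cardinality():
    class _Tok(object):
        def __init__(self, text, lemma="_"):
            self.text = text
            self.lemma = lemma
            self.modifiers = []

    class _Mark(object):
        pass

    class _StubLex(object):
        numbers = {"three": ["3"]}
        filters = {"thousand_sep": ",", "decimal_sep": "."}

    lex = _StubLex()
    mark = _Mark()
    mark.head = _Tok("cats")
    mark.head.modifiers = [_Tok("1,500.5")]  # thousands separator is stripped before parsing
    assert resolve_cardinality(mark, lex) == 1500.5
    mark.head.modifiers = [_Tok("three")]  # number word looked up in the gazetteer
    assert resolve_cardinality(mark, lex) == 3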
def recognize_entity_by_mod(mark, lex, mark_atoms=False):
    """
    Attempt to recognize entity type based on modifiers

    :param mark: :class:`.Markable` for which to identify the entity type
    :param lex: the :class:`.LexData` object, whose modifier list and modifier atoms are consulted
    :param mark_atoms: whether to flag atomic modifiers with a trailing "@"
    :return: string (entity type, possibly including subtype and agreement)
    """

    modifier_lexicon = lex.entity_mods
    mod_atoms = lex.mod_atoms
    for mod in mark.head.modifiers:
        modifier_tokens = [mod.text] + [construct_modifier_substring(mod)]
        while len(modifier_tokens) > 0:
            identifying_substr = ""
            for token in modifier_tokens:
                identifying_substr += token + " "
                if identifying_substr.strip() in modifier_lexicon:
                    if identifying_substr.strip() in mod_atoms and mark_atoms:
                        return modifier_lexicon[identifying_substr.strip()][0] + "@"
                    else:
                        return modifier_lexicon[identifying_substr.strip()][0]
                elif identifying_substr.lower().strip() in modifier_lexicon:
                    if identifying_substr.lower().strip() in mod_atoms and mark_atoms:
                        return modifier_lexicon[identifying_substr.lower().strip()][0] + "@"
                    else:
                        return modifier_lexicon[identifying_substr.lower().strip()][0]
            modifier_tokens.pop(0)
    return ""
def construct_modifier_substring(modifier):
    """
    Creates a string of tokens representing a modifier and all of its submodifiers in sequence

    :param modifier: a :class:`.ParsedToken` object from the modifier list of the head of some markable
    :return: text of that modifier together with its modifiers in sequence
    """

    candidate_prefix = ""
    mod_dict = get_mod_ordered_dict(modifier)
    for mod_member in mod_dict:
        candidate_prefix += mod_dict[mod_member].text + " "
    return candidate_prefix.strip()
def stoplist_prefix_tokens(mark, prefix_dict, keys_to_pop):
    # Find modifier tokens matching a stoplisted prefix and queue their ids in keys_to_pop
    substr = ""
    candidate_prefix = ""
    for mod in mark.head.modifiers:
        mod_dict = get_mod_ordered_dict(mod)
        for mod_member in mod_dict:
            candidate_prefix += mod_dict[mod_member].text + " "
        tokens = candidate_prefix.strip().split(" ")
        for token in tokens:
            substr += token + " "
            if substr.strip() in prefix_dict:
                tokens_affected_count = substr.count(" ")
                i = 0
                for mod_token in mod_dict:
                    if i < tokens_affected_count and not mod_dict[mod_token].id == mark.head.id:
                        keys_to_pop.append(mod_dict[mod_token].id)
                    i += 1
def get_mod_ordered_dict(mod):
    """
    Retrieves the (sub)modifiers of a modifier token

    :param mod: A :class:`.ParsedToken` object representing a modifier of the head of some markable
    :return: recursive ordered dictionary of that modifier's own modifiers
    """

    mod_dict = OrderedDict()
    mod_dict[int(mod.id)] = mod
    if len(mod.modifiers) > 0:
        for mod2 in mod.modifiers:
            mod_dict.update(get_mod_ordered_dict(mod2))
    else:
        return mod_dict
    return OrderedDict(sorted(mod_dict.items()))
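
# Illustrative sketch (hypothetical token stubs, not from xrenner): the recursion
# collects a modifier and its submodifiers, returned in token-id order.
def _demo_get_mod_ordered_dict():
    class _Tok(object):
        def __init__(self, id, text, modifiers=None):
            self.id = id
            self.text = text
            self.modifiers = modifiers if modifiers is not None else []

    very = _Tok("2", "very")
    old = _Tok("3", "old", [very])  # "very" modifies "old"
    result = get_mod_ordered_dict(old)
    assert list(result.items()) == [(2, very), (3, old)]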
def markable_extend_punctuation(marktext, adjacent_token, punct_dict, direction):
    # Check whether the markable contains punctuation that the adjacent token would balance
    if direction == "trailing":
        for open_punct in punct_dict:
            if (" " + open_punct + " " in marktext or marktext.startswith(open_punct + " ")) and adjacent_token.text == punct_dict[open_punct]:
                return True
    else:
        for close_punct in punct_dict:
            if (" " + close_punct + " " in marktext or marktext.endswith(" " + close_punct)) and adjacent_token.text == punct_dict[close_punct]:
                return True
    return False
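
# Illustrative sketch (hypothetical stub token and punctuation map, not from
# xrenner's configuration): a markable containing an unbalanced "(" may be
# extended by a trailing ")".
def _demo_markable_extend_punctuation():
    class _Tok(object):
        def __init__(self, text):
            self.text = text

    open_close = {"(": ")", "[": "]"}
    assert markable_extend_punctuation("( former champion", _Tok(")"), open_close, "trailing")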
def markables_overlap(mark1, mark2, lex=None):
    """
    Helper function to check if two markables cover some of the same tokens. Note that if the lex argument is
    specified, it is used to recognize possessives, which behave exceptionally. Possessive pronouns beginning after
    a main markable has started are tolerated in case of markable definitions including relative clauses,
    e.g. [Mr. Pickwick, who was looking for [his] hat]

    :param mark1: First :class:`.Markable`
    :param mark2: Second :class:`.Markable`
    :param lex: the :class:`.LexData` object with gazetteer information and model settings, or None
    :return: bool
    """

    if lex is not None:
        if lex.filters["possessive_func"].match(mark1.func) is not None and mark1.form == "pronoun" and mark1.start > mark2.start:
            return False
        elif lex.filters["possessive_func"].match(mark2.func) is not None and mark2.form == "pronoun" and mark2.start > mark1.start:
            return False
    if mark2.end >= mark1.start >= mark2.start:
        return True
    elif mark2.end >= mark1.end >= mark2.start:
        return True
    else:
        return False
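
# Illustrative sketch (hypothetical markable stubs, not from xrenner): overlap is
# decided purely on token span boundaries when lex is None.
def _demo_markables_overlap():
    class _Mark(object):
        def __init__(self, start, end):
            self.start = start
            self.end = end

    # Token spans 3-5 and 5-7 share token 5; spans 3-5 and 6-7 are disjoint
    assert markables_overlap(_Mark(3, 5), _Mark(5, 7))
    assert not markables_overlap(_Mark(3, 5), _Mark(6, 7))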
def markable_extend_affixes(start, end, conll_tokens, sent_start, lex):
    # Look for configured affix tokens immediately before or after the markable span;
    # return the [start, end) token id range to absorb, or [0, 0] if none is found
    candidate_affix = ""
    for tok in reversed(conll_tokens[sent_start:start]):
        candidate_affix = tok.text + " " + candidate_affix
        if candidate_affix.lower().strip() in lex.affix_tokens:
            if lex.affix_tokens[candidate_affix.lower().strip()] == "prefix":
                return [int(tok.id), int(tok.id) + candidate_affix.count(" ")]
        elif candidate_affix.strip() in lex.affix_tokens:
            if lex.affix_tokens[candidate_affix.strip()] == "prefix":
                return [int(tok.id), int(tok.id) + candidate_affix.count(" ")]
    candidate_affix = ""
    for tok in conll_tokens[end + 1:]:
        candidate_affix += tok.text + " "
        if candidate_affix.lower().strip() in lex.affix_tokens:
            if lex.affix_tokens[candidate_affix.lower().strip()] == "suffix":
                return [int(tok.id) - candidate_affix.strip().count(" "), int(tok.id) + 1]
        elif candidate_affix.strip() in lex.affix_tokens:
            if lex.affix_tokens[candidate_affix.strip()] == "suffix":
                return [int(tok.id) - candidate_affix.strip().count(" "), int(tok.id) + 1]
    return [0, 0]


def get_entity_by_affix(head_text, lex):
    # Guess entity probabilities from the longest known morphological affix of the head
    entity = ""
    probs = {}
    for i in range(1, len(head_text) - 1):
        candidates = 0
        if head_text[i:] in lex.morph:
            options = lex.morph[head_text[i:]]
            for key, value in options.items():
                candidates += value
                entity = key.split("/")[0]
                probs[entity] = float(value)
            for entity in probs:
                probs[entity] = probs[entity] / candidates
        if entity != "":
            return probs
    return probs
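
# Illustrative sketch (hypothetical morphological gazetteer, not from xrenner's
# data): get_entity_by_affix matches the longest known suffix of the head and
# returns normalized entity probabilities.
def _demo_get_entity_by_affix():
    class _StubLex(object):
        # suffix string -> {entity/agree: count}; entries invented for demonstration
        morph = {"ness": {"abstract/none": 3}}

    probs = get_entity_by_affix("happiness", _StubLex())
    assert probs == {"abstract": 1.0}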
def pos_func_combo(pos, func, pos_func_heads_string):
    """
    Check whether a given POS tag and dependency function combination is allowed to head a markable

    :param pos: the POS tag of the candidate head
    :param func: the dependency function of the candidate head
    :param pos_func_heads_string: semicolon-separated setting of pos+func (allowed) and pos!func (forbidden) combinations
    :return: bool
    """

    pos_func_heads = pos_func_heads_string.split(";")
    if pos + "+" + func in pos_func_heads:
        return True
    elif pos + "!" + func in pos_func_heads:
        return False
    else:
        # If the POS is listed with some forbidden function, other functions are allowed by default
        if pos_func_heads_string.find(";" + pos + "!") > -1 or pos_func_heads_string.startswith(pos + "!"):
            return True
        else:
            return False
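
# Illustrative sketch (hypothetical setting string, not from xrenner's default
# configuration): how allowed (+) and forbidden (!) combinations are resolved.
def _demo_pos_func_combo():
    setting = "NN+nsubj;NN!amod"
    assert pos_func_combo("NN", "nsubj", setting)      # explicitly allowed
    assert not pos_func_combo("NN", "amod", setting)   # explicitly forbidden
    assert pos_func_combo("NN", "dobj", setting)       # NN has a ! entry, so others default to allowed
    assert not pos_func_combo("VB", "nsubj", setting)  # unlisted POS defaults to not allowed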
def replace_head_with_lemma(mark):
    # Substitute the head token's lemma for its text form within the markable's core text
    head = re.escape(mark.head.text)
    lemma = mark.head.lemma
    return re.sub(head, lemma, mark.core_text).strip()


def make_markable(tok, conll_tokens, descendants, tokoffset, sentence, keys_to_pop, lex):
    if tok.id in descendants and lex.filters["non_extend_pos"].match(tok.pos) is None:
        tokenspan = descendants[tok.id] + [tok.id]
        tokenspan = list(map(int, tokenspan))
        tokenspan.sort()
        marktext = ""
        start = min(tokenspan)
        end = max(tokenspan)
        for span_token in conll_tokens[start:end + 1]:
            marktext += span_token.text + " "
        marktext = marktext.strip()
    else:
        marktext = tok.text
        start = int(tok.id)
        end = int(tok.id)

    # Check for a trailing coordinating conjunction on a descendant of the head and re-connect if necessary
    if end < len(conll_tokens) - 1:
        coord = conll_tokens[end + 1]
        if lex.filters["cc_left_to_right"]:
            not_head_child = coord.head != tok.id
        else:
            coord_grand_head = 0
            if int(conll_tokens[int(coord.head)].head) != 0:
                coord_grand_head = int(conll_tokens[int(coord.head)].head)
            not_head_child = (conll_tokens[int(coord.head)].head != tok.id and coord_grand_head == int(tok.id) and
                              conll_tokens[int(coord.head)].head != '0' and int(conll_tokens[int(coord.head)].head) > int(tok.id))
        if lex.filters["coord_func"].match(coord.func) is not None and not_head_child and int(coord.head) >= start:
            conjunct1 = conll_tokens[int(conll_tokens[end + 1].head)]
            for tok2 in conll_tokens[end + 1:]:
                if (tok2.head == conjunct1.head and tok2.func == conjunct1.func) or tok2.head == coord.id:
                    conjunct2 = tok2
                    tokenspan = [conjunct2.id, str(end)]
                    if conjunct2.id in descendants:
                        tokenspan += descendants[conjunct2.id]
                    tokenspan = map(int, tokenspan)
                    tokenspan = sorted(tokenspan)
                    end = max(tokenspan)
                    marktext = ""
                    for span_token in conll_tokens[start:end + 1]:
                        marktext += span_token.text + " "
                    break

    core_text = marktext.strip()
    # DEBUG POINT #
    if marktext.strip() in lex.debug:
        pass

    # Extend markable to 'affix tokens'
    # Do not extend pronouns or stop functions
    if lex.filters["stop_func"].match(tok.func) is None and lex.filters["pronoun_pos"].match(tok.pos) is None:
        extend_affixes = markable_extend_affixes(start, end, conll_tokens, tokoffset + 1, lex)
        if not extend_affixes[0] == 0:
            if extend_affixes[0] < start:
                prefix_text = ""
                for prefix_tok in conll_tokens[extend_affixes[0]:extend_affixes[1]]:
                    prefix_text += prefix_tok.text + " "
                    keys_to_pop.append(prefix_tok.id)
                    start -= 1
                marktext = prefix_text + marktext
            else:
                for suffix_tok in conll_tokens[extend_affixes[0]:extend_affixes[1]]:
                    keys_to_pop.append(suffix_tok.id)
                    marktext += suffix_tok.text + " "
                    end += 1

    # Extend markable to trailing closing punctuation if it contains opening punctuation
    if end < len(conll_tokens) - 1:
        next_id = end + 1
        if markable_extend_punctuation(marktext, conll_tokens[next_id], lex.open_close_punct, "trailing"):
            marktext += conll_tokens[next_id].text + " "
            end += 1
    if start > 1:  # NB: start is an int; comparing against the string "1" would always be True
        prev_id = start - 1
        if markable_extend_punctuation(marktext, conll_tokens[prev_id], lex.open_close_punct_rev, "leading"):
            marktext = conll_tokens[prev_id].text + " " + marktext
            start -= 1

    this_markable = Markable("", tok, "", "", start, end, core_text, core_text, "", "", "", "new", "", sentence, "none", "none", 0, [], [], [])

    # DEBUG POINT #
    if this_markable.text == lex.debug["ana"]:
        pass

    this_markable.core_text = remove_infix_tokens(remove_suffix_tokens(remove_prefix_tokens(this_markable.core_text, lex), lex), lex)
    while this_markable.core_text != core_text:  # Potentially repeat affix stripping as long as core text changes
        core_text = this_markable.core_text
        this_markable.core_text = remove_infix_tokens(remove_suffix_tokens(remove_prefix_tokens(this_markable.core_text, lex), lex), lex)
    if this_markable.core_text == '':  # Check in case suffix removal has left no core_text
        this_markable.core_text = marktext.strip()

    this_markable.text = marktext  # Update text with potentially extended markable text

    return this_markable
def lookup_has_entity(text, lemma, entity, lex):
    """
    Checks if a certain token text or lemma has the specified entity listed in the entities or entity_heads lists

    :param text: text of the token
    :param lemma: lemma of the token
    :param entity: entity to check for
    :param lex: the :class:`.LexData` object with gazetteer information and model settings
    :return: bool
    """

    found = []
    if text in lex.entities:
        found = [i for i, x in enumerate(lex.entities[text]) if re.search(entity + "\t", x)]
    elif lemma in lex.entities:
        found = [i for i, x in enumerate(lex.entities[lemma]) if re.search(entity + "\t", x)]
    elif text in lex.entity_heads:
        found = [i for i, x in enumerate(lex.entity_heads[text]) if re.search(entity + "\t", x)]
    elif lemma in lex.entity_heads:
        found = [i for i, x in enumerate(lex.entity_heads[lemma]) if re.search(entity + "\t", x)]
    return len(found) > 0
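
# Illustrative sketch (hypothetical gazetteer entry, not from xrenner's data):
# lookup_has_entity searches entries of the form entity TAB subclass TAB frequency.
def _demo_lookup_has_entity():
    class _StubLex(object):
        entities = {"Kim": ["person\tperson\t12"]}
        entity_heads = {}

    lex = _StubLex()
    assert lookup_has_entity("Kim", "Kim", "person", lex)
    assert not lookup_has_entity("Kim", "Kim", "place", lex)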
def assign_coordinate_entity(mark, markables_by_head):
    """
    Checks if all constituents of a coordinate markable have the same entity
    and subclass, and if so, propagates these to the coordinate markable.

    :param mark: a coordinate markable whose constituents' entities should be checked
    :param markables_by_head: dictionary of markables by head id
    :return: void
    """

    sub_entities = []
    sub_subclasses = []
    for m_id in mark.submarks:
        if m_id in markables_by_head:
            sub_entities.append(markables_by_head[m_id].entity)
            sub_subclasses.append(markables_by_head[m_id].subclass)
    if len(set(sub_entities)) == 1:  # There is agreement on the entity
        mark.entity = sub_entities[0]
    if len(set(sub_subclasses)) == 1:  # There is agreement on the subclass
        mark.subclass = sub_subclasses[0]
def disambiguate_entity(mark, lex):
    """
    Selects the preferred entity for a Markable with multiple alt_entities, based on dependency information
    or the more common type

    :param mark: the :class:`.Markable` object
    :param lex: the :class:`.LexData` object with gazetteer information and model settings
    :return: predicted entity type as string
    """

    ## DEBUG POINT ##
    if mark.text in lex.debug["ana"]:
        pass  # Set a breakpoint here to inspect the markable under analysis

    # Prefer sequencer entity if it is one of the options
    use_sequencer = True if lex.sequencer is not None else False
    if use_sequencer:
        seq_ent = mark.head.seq_pred[0]
        if seq_ent in mark.alt_entities:
            return seq_ent

    parent_text = mark.head.head_text
    scores = defaultdict(float)
    entity_freqs = defaultdict(int)

    # Bias tie breaker in favor of default entity
    if lex.filters["default_entity"] in mark.alt_entities:
        scores[lex.filters["default_entity"]] += 0.0001

    if parent_text in lex.entity_deps:
        if mark.func in lex.entity_deps[parent_text]:
            for alt_entity in mark.alt_entities:
                if alt_entity in lex.entity_deps[parent_text][mark.func]:
                    entity_freqs[alt_entity] = lex.entity_deps[parent_text][mark.func][alt_entity]
    if len(entity_freqs) == 0:  # No dependency info, use similar dependencies if available
        if parent_text in lex.similar:
            for similar_parent in lex.similar[parent_text]:
                if similar_parent in lex.entity_deps:
                    if mark.func in lex.entity_deps[similar_parent]:
                        for alt_entity in mark.alt_entities:
                            if alt_entity in lex.entity_deps[similar_parent][mark.func]:
                                entity_freqs[alt_entity] = lex.entity_deps[similar_parent][mark.func][alt_entity]

    # Check for ties
    break_tie = False
    if len(entity_freqs) > 0:
        best_freq = max(entity_freqs.values())
        best_entities = [k for k, v in entity_freqs.items() if v == best_freq]
        if len(best_entities) > 1:
            break_tie = True

    if len(entity_freqs) == 0 or break_tie:  # No similar dependencies, get frequency information from entities if available
        if mark.text in lex.entities:
            for entity_entry in lex.entities[mark.text]:
                entity_type, entity_subtype, freq = entity_entry.split("\t")
                freq = int(freq)
                if freq > 0:
                    entity_freqs[entity_type] += freq
    if len(entity_freqs) == 0 or break_tie:  # Still nothing, get frequency information from heads if available
        if mark.head.text in lex.entity_heads:
            for entity_entry in lex.entity_heads[mark.head.text]:
                entity_type, entity_subtype, freq = entity_entry.split("\t")
                freq = int(freq)
                if freq > 0:
                    entity_freqs[entity_type] += freq
    if len(entity_freqs) == 0:  # No dependency info, use entity sum proportions
        entity_freqs = lex.entity_sums

    for entity_type in mark.alt_entities:
        scores[entity_type] += entity_freqs[entity_type]

    best_entity = max(iterkeys(scores), key=(lambda key: scores[key]))
    return best_entity