Source code for sulci.rules_templates

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import re

from collections import defaultdict
from operator import itemgetter
from GenericCache.GenericCache import GenericCache
from GenericCache.decorators import cached

from utils import load_file, save_to_file, log
from base import RetrievableObject

# Cache
cache = GenericCache()

class RuleTemplate(RetrievableObject):
    """
    Class for managing rules creation and analysis.
    """

    def __init__(self, pk, **kwargs):
        self.id = pk

    def test_rule(self, token, rule):
        to_tag = self.get_to_tag(rule)
        if not self.is_candidate(token, rule):
            return 0
        elif token.has_verified_tag(to_tag) and not token.is_tagged(to_tag):
            return 1
        else:
            return -1

    def get_to_tag(self, rule):
        from_tag, to_tag, _, complement = self.uncompile_rule(rule)
        return to_tag

    def apply_rule(self, tokens, rule):
        """
        Apply rule to candidates in a set of tokens.
        """
        to_tag = self.get_to_tag(rule)
        for token in tokens:
            if self.is_candidate(token, rule):
                token.tag = to_tag
                # Maybe we should do this only in training mode
                token.sample.reset_trainer_status()

    @classmethod
    def select_one(cls, rules, MAX, minval=2):
        """
        Select one rule from a set of tested rules.

        `rules` is an iterable of tuples (rule, good, bad), where good is
        the number of errors corrected, and bad the number of errors
        generated.
        """
        # Questions are:
        # - which rule to select?
        # - what to favour?
        #   - the difference between good and bad?
        #   - the ratio between good and bad?
        #   - a mix of both? with which coefficient?
        # Reminder (from kilobug):
        # Basically, we have two options:
        # - either good/(good + bad) * A + good/MAX * B
        # - or (good/(good + bad)) ** A * (good/MAX) ** B
        # (arithmetic or geometric mean)
        # But to keep things simple we should manage with a single coefficient:
        # - good/(good + bad) + good/MAX * B
        # - (good/(good + bad)) * (good/MAX) ** B
        # For example:
        # - SBC:pl CHANGESUFFIX "rs" "r"  g: 15  b: 0
        # - SBC:pl CHANGESUFFIX "s" ""    g: 179 b: 18
        # => which one should be chosen?
        # The first means: stick to human logic (try to create fewer errors
        # when applying rules).
        # The second: use your own logic (create errors if it seems a good
        # trade-off, and create new rules to correct the newly created errors).
        # Sort using the ratio good / bad, giving advantage to the more
        # productive rule when the ratios are close.
        coeff = 0.1  # The lower the coeff, the stronger the rules with bad == 0
        g = lambda good, bad: (float(good) / (float(good) + float(bad))) \
                              * ((float(good) / MAX) ** coeff)
        try:
            return sorted([(r[0], g(r[1], r[2])) for r in rules
                           if r[1] / max(r[2], 1) >= minval],
                          key=itemgetter(1), reverse=True)[0]
        except IndexError:
            return None, None
        # Alternative: sort by the difference between good and bad, giving
        # advantage to the less bad one when the difference is the same.
        # Adding 0.1 to prevent division by zero.
        # return sorted([(r[0], (r[1] - r[2] - (r[2] / (r[1] + 0.1)))) for r in rules],
        #               key=itemgetter(1), reverse=True)[0]
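
    # A quick worked check of the scoring above (a sketch; the two rules and
    # their good/bad counts are quoted from the comment block, and MAX is
    # assumed to be 179):
    #   g(15, 0)   = (15 / 15.)   * (15 / 179.)  ** 0.1  ~= 0.78
    #   g(179, 18) = (179 / 197.) * (179 / 179.) ** 0.1  ~= 0.91
    # So with coeff == 0.1 the second, more productive rule wins, even though
    # it generates a few new errors.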

    def make_rules(self, token):
        comp = self.get_complement(token)
        if len(comp) > 0:
            return [self.compile_rule(token.tag, token.verified_tag, comp)]
        return []

    def test_complement(self, token, complement):
        # Could be cached.
        return complement == self.get_complement(token)

    def is_candidate(self, token, rule):
        # Should we check if the word is in the lexicon and to_tag a possible
        # tag for it?
        from_tag, to_tag, _, complement = self.uncompile_rule(rule)
        # We don't always have a from_tag (e.g. template hassuf)
        if (from_tag and token.tag != from_tag) or \
           not self.test_complement(token, complement):
            return False
        return True


class ContextualTemplateGenerator(type):

    register = dict()
    _loaded_rules = None

    def __new__(mcs, name, base, dict):
        theclass = type.__new__(mcs, name, base, dict)
        if name.isupper():
            ContextualTemplateGenerator.register[name] = theclass
        return theclass

    @classmethod
    def get_instance(cls, s, **kwargs):
        """
        Returns an instance of a rule template, from a template name or a
        rule string.

        `s` can be a template name or a rule.
        """
        if s.count(" ") > 0:  # rule
            _, _, name, _ = ContextualBaseTemplate.uncompile_rule(s)
        else:
            name = s
        child_class = cls.register[name]
        return child_class.get_or_create(name, ContextualTemplateGenerator)

    @classmethod
    def export(cls, rules):
        """
        Export rules to the provisional config file.

        `rules` are tuples (rule, score).
        """
        save_to_file("corpus/contextual_rules.pdg",
                     "\n".join(rule for rule, score in rules))

    @classmethod
    def load(cls):
        """
        Load rules from the config file.
        """
        if cls._loaded_rules is None:
            log("Loading contextual rules...", "CYAN", True)
            lx = load_file("corpus/contextual_rules.rls")
            cls._loaded_rules = [r for r in lx.split(u"\n") if len(r) > 1]
        return cls._loaded_rules


class ContextualBaseTemplate(RuleTemplate):
    """
    Base class for the contextual rules.
    """
    __metaclass__ = ContextualTemplateGenerator
    _uncompiled_rules = {}

    def compile_rule(self, from_tag, to_tag, complement):
        """
        Make the final rule string.

        `complement` must be an iterable.
        """
        comp = " ".join(unicode(c) for c in complement)
        return u"%s %s %s %s" % (from_tag, to_tag, self.__class__.__name__, comp)

    @classmethod
    def uncompile_rule(cls, rule):
        try:
            return cls._uncompiled_rules[rule]
        except KeyError:
            els = rule.split()
            cls._uncompiled_rules[rule] = els[0], els[1], els[2], els[3:]
            return cls._uncompiled_rules[rule]
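
    # Example round-trip (a sketch; the tags are hypothetical, the format is
    # "<from_tag> <to_tag> <TEMPLATE> <complement...>"): a PREVTAG rule
    # compiled with from_tag=u"ADJ:sg", to_tag=u"SBC:sg",
    # complement=[u"DTC:sg"] gives:
    #   u"ADJ:sg SBC:sg PREVTAG DTC:sg"
    # and uncompile_rule() splits it back into:
    #   (u"ADJ:sg", u"SBC:sg", u"PREVTAG", [u"DTC:sg"])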


class LexicalTemplateGenerator(type):

    register = dict()
    _loaded_rules = None  # caching

    def __new__(mcs, name, base, dict):
        theclass = type.__new__(mcs, name, base, dict)
        if name.islower():
            LexicalTemplateGenerator.register[name] = theclass
        return theclass

    @classmethod
    def get_instance(cls, s, lexicon):
        """
        `s` can be a template name or a rule.
        """
        if s.count(" ") > 0:  # rule
            _, _, name, _ = LexicalBaseTemplate.uncompile_rule(s)
        else:
            name = s
        child_class = cls.register[name]
        return child_class.get_or_create(name, LexicalTemplateGenerator,
                                         lexicon=lexicon)

    @classmethod
    def export(cls, rules):
        """
        `rules` are tuples (rule, score).
        """
        save_to_file("corpus/lexical_rules.pdg",
                     "\n".join("%s\t%f" % (rule, float(score))
                               for rule, score
                               in sorted(rules, key=itemgetter(1), reverse=True)))

    @classmethod
    def load(cls):
        if cls._loaded_rules is None:
            log("Loading lexical rules...", "CYAN", True)
            cls._loaded_rules = []
            lx = load_file("corpus/lexical_rules.rls")
            for line in lx.split(u"\n"):
                els = line.split(u"\t")
                if els[0] != u"":
                    cls._loaded_rules.append(els[0])
        return cls._loaded_rules


class LexicalBaseTemplate(RuleTemplate):
    """
    Base class for the lexical rules.
    """
    __metaclass__ = LexicalTemplateGenerator
    _uncompiled_rules = {}

    def __init__(self, pk, **kwargs):
        self.id = pk
        self.lexicon = kwargs["lexicon"]
        self.check_from_tag = self.__class__.__name__.startswith(u"f")

    def compile_rule(self, from_tag, to_tag, complement):
        if self.check_from_tag:
            return u"%s %s %s %d %s" % (from_tag, unicode(complement),
                                        self.__class__.__name__,
                                        len(complement), to_tag)
        else:
            return u"%s %s %d %s" % (unicode(complement),
                                     self.__class__.__name__,
                                     len(complement), to_tag)

    @classmethod
    def _uncompile_rule(cls, rule):
        els = rule.split()
        for el in els:
            if el in LexicalTemplateGenerator.register:  # el is the class name
                if u"good" in el:  # proximity template (no length field)
                    if el.startswith("f"):
                        from_tag, complement, classname, to_tag = els
                    else:
                        complement, classname, to_tag = els
                        from_tag = None
                else:  # affix template (with a length field)
                    if el.startswith("f"):
                        from_tag, complement, classname, _, to_tag = els
                    else:
                        complement, classname, _, to_tag = els
                        from_tag = None
                break
        cls._uncompiled_rules[rule] = from_tag, to_tag, classname, complement

    @classmethod
    def uncompile_rule(cls, rule):
        # Cached in cls._uncompiled_rules.
        if not rule in cls._uncompiled_rules:
            cls._uncompile_rule(rule)
        return cls._uncompiled_rules[rule]
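
    # Example (a sketch, reusing the rules quoted in the hassuf/fhassuf
    # docstrings below):
    #   uncompile_rule(u"ment hassuf 4 ADV")
    #       -> (None, u"ADV", u"hassuf", u"ment")
    #   uncompile_rule(u"SBC:sg ment fhassuf 4 ADV")
    #       -> (u"SBC:sg", u"ADV", u"fhassuf", u"ment")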

    def test_complement(self, token, complement):
        return complement in self.get_complement(token)


class ProximityCheckTemplate(LexicalBaseTemplate):

    def compile_rule(self, from_tag, to_tag, complement):
        """
        No length field for proximity rules.
        """
        if self.check_from_tag:
            return u"%s %s %s %s" % (from_tag, unicode(complement),
                                     self.__class__.__name__, to_tag)
        else:
            return u"%s %s %s" % (unicode(complement),
                                  self.__class__.__name__, to_tag)


class NoLexiconCheckTemplate(LexicalBaseTemplate):

    def make_rules(self, token):
        final = []
        for affix in self.get_complement(token):
            to_tag = token.verified_tag
            from_tag = token.tag
            rule = self.compile_rule(from_tag, to_tag, affix)
            final.append(rule)
        return final


class LexiconCheckTemplate(LexicalBaseTemplate):
    """
    Base template for those that have to check the lexicon.
    """

    def make_rules(self, token):
        final = []
        for affix, ceased_tk in self.get_complement(token):
            if ceased_tk in self.lexicon:
                to_tag = token.verified_tag  # or lexicon token tag?
                if self.check_from_tag:
                    rule = u"%s %s %s %d %s" % (token.tag, unicode(affix),
                                                self.__class__.__name__,
                                                len(affix), to_tag)
                else:
                    rule = u"%s %s %d %s" % (unicode(affix),
                                             self.__class__.__name__,
                                             len(affix), to_tag)
                final.append(rule)
        return final

    def test_complement(self, token, complement):
        """
        For the lexicon check rules, we need to check if the modified word
        is in the lexicon.
        """
        return self.modified_token(token, complement) in self.lexicon


class deletesuf(LexiconCheckTemplate):
    """
    Change current tag to tag X, if removing suffix Y leads to an entry of
    the lexicon.
    """

    def get_complement(self, token):
        """
        Return a list of (affix, ceased_token) tuples.
        """
        final = []
        tlen = len(token.original)
        for i in xrange(1, min(5 + 1, tlen)):
            affix = token.original[tlen - i:tlen]
            # The token with its suffix removed (was token.original[:i],
            # which kept the first i letters instead).
            ceased_tk = token.original[:tlen - i]
            final.append((affix, ceased_tk))
        return final

    def test_complement(self, token, complement):
        """
        Test if the token has the right suffix, and if deleting it results
        in a word of the lexicon.
        """
        return token[-len(complement):] == complement and \
               token[:-len(complement)] in self.lexicon
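
    # Sketch of the candidate pairs: assuming a token whose original is
    # u"chanteurs", get_complement() yields, for i = 1..5:
    #   (u"s",     u"chanteur")
    #   (u"rs",    u"chanteu")
    #   (u"urs",   u"chante")
    #   (u"eurs",  u"chant")
    #   (u"teurs", u"chan")
    # make_rules() then keeps only the pairs whose ceased token is in the
    # lexicon (presumably u"chanteur" here).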


class fdeletesuf(deletesuf):
    """
    Change current tag to tag X, if removing suffix Y leads to an entry of
    the lexicon and if current tag is Z.
    """
    pass


class deletepref(LexiconCheckTemplate):
    """
    Change current tag to tag X, if removing prefix Y leads to an entry of
    the lexicon. Prefix Y length from 1 to 4 (len(Y) <= 4).

    Syntax : Y deletepref len(Y) X
    Ex. : re deletepref 2 VNCFF
    """

    def get_complement(self, token):
        final = []
        tlen = len(token.original)
        for i in xrange(1, min(5, tlen)):
            ceased_tk = token.original[i:]
            affix = token.original[0:i]
            final.append((affix, ceased_tk))
        return final

    def test_complement(self, token, complement):
        """
        Test if the token has the right prefix, and if deleting it results
        in a word of the lexicon.
        """
        return token[:len(complement)] == complement and \
               token[len(complement):] in self.lexicon


class fdeletepref(deletepref):
    """
    Change current tag to tag X, if removing prefix Y leads to an entry of
    the lexicon and if current tag is Z. Prefix Y length from 1 to 4
    (len(Y) <= 4).

    Syntax : Z Y fdeletepref len(Y) X
    Ex. : ADV re fdeletepref 2 VNCFF
    """
    pass


class addpref(LexiconCheckTemplate):
    """
    Change current tag to tag X, if adding prefix Y leads to an entry of
    the lexicon. Prefix Y length from 1 to 4 (len(Y) <= 4).

    Syntax : Y addpref len(Y) X
    Ex. : er addpref 2 VNCFF
    """

    def get_complement(self, token):
        final = []
        if token.original in self.lexicon.factors:
            for affix in self.lexicon.prefixes:
                increased_tk = self.modified_token(token, affix)
                final.append((affix, increased_tk))
        return final

    def modified_token(self, token, complement):
        return complement + token.original


class faddpref(addpref):
    """
    Change current tag to tag X, if adding prefix Y leads to an entry of
    the lexicon and if current tag is Z. Prefix Y length from 1 to 4
    (len(Y) <= 4).

    Syntax : Z Y faddpref len(Y) X
    Ex. : SBC:sg re faddpref 2 VNCFF
    """
    pass


class addsuf(LexiconCheckTemplate):
    """
    Change current tag to tag X, if adding suffix Y leads to an entry of
    the lexicon. Suffix Y length from 1 to 4 (len(Y) <= 4).

    Syntax : Y addsuf len(Y) X
    Ex. : re addsuf 2 VNCFF
    """

    def get_complement(self, token):
        final = []
        if token.original in self.lexicon.factors:
            for affix in self.lexicon.suffixes:
                increased_tk = self.modified_token(token, affix)
                final.append((affix, increased_tk))
        return final

    def modified_token(self, token, complement):
        return token.original + complement


class faddsuf(addsuf):
    """
    Change current tag to tag X, if adding suffix Y leads to an entry of
    the lexicon and current tag is Z. Suffix Y length from 1 to 4
    (len(Y) <= 4).

    Syntax : Z Y faddsuf len(Y) X
    Ex. : SBC:sg re faddsuf 2 VNCFF
    """
    pass


class hassuf(NoLexiconCheckTemplate):
    """
    Change current tag to tag X, if the suffix is Y. Suffix Y length from
    1 to 4 (len(Y) <= 4).

    Syntax : Y hassuf len(Y) X
    Ex. : ment hassuf 4 ADV
    """

    def get_complement(self, token):
        """
        Return the set of known suffixes of the token.
        """
        final = set()
        tlen = len(token.original)
        for i in xrange(1, min(5 + 1, tlen)):
            affix = token.original[tlen - i:tlen]
            if affix in self.lexicon.suffixes:
                final.add(affix)
        return final
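
    # Sketch: for a token whose original is u"doucement", the loop above
    # collects the candidate suffixes u"t", u"nt", u"ent", u"ment", u"ement",
    # and keeps only those present in self.lexicon.suffixes (presumably
    # u"ment" here, cf. the docstring example).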


class fhassuf(hassuf):
    """
    Change current tag to tag X, if the suffix is Y and current tag is Z.
    Suffix Y length from 1 to 4 (len(Y) <= 4).

    Syntax : Z Y fhassuf len(Y) X
    Ex. : SBC:sg ment fhassuf 4 ADV
    """
    pass


class haspref(NoLexiconCheckTemplate):
    """
    Change current tag to tag X, if the prefix is Y. Prefix Y length from
    1 to 4 (len(Y) <= 4).

    Syntax : Y haspref len(Y) X
    Ex. : pro haspref 3 SBC:sg
    """

    def get_complement(self, token):
        final = []
        tlen = len(token.original)
        for i in xrange(1, min(5, tlen)):
            affix = token.original[0:i]
            if affix in self.lexicon.prefixes:
                final.append(affix)
        return final


class fhaspref(haspref):
    """
    Change current tag to tag X, if the prefix is Y and current tag is Z.
    Prefix Y length from 1 to 4 (len(Y) <= 4).

    Syntax : Z Y fhaspref len(Y) X
    Ex. : ADV bla fhaspref 3 DTC:sg
    """
    pass


class goodright(NoLexiconCheckTemplate, ProximityCheckTemplate):
    """
    The current word is at the right of the word X.
    """

    def get_complement(self, token):
        return [unicode(t.original) for t in token.get_neighbors(-1)]


class fgoodright(goodright):
    pass


class goodleft(NoLexiconCheckTemplate, ProximityCheckTemplate):
    """
    The current word is at the left of the word X.
    """

    def get_complement(self, token):
        return [unicode(t.original) for t in token.get_neighbors(1)]


class fgoodleft(goodleft):
    pass


class WordBasedTemplate(ContextualBaseTemplate):
    """
    Abstract class for word-based templates.
    """

    def get_complement(self, token):
        args = self.get_target()
        return [unicode(e.original) for e in token.get_neighbors(*args)]


class TagBasedTemplate(ContextualBaseTemplate):
    """
    Abstract class for tag-based templates.
    """

    def get_complement(self, token):
        args = self.get_target()
        return [e.verified_tag for e in token.get_neighbors(*args)]


class WordTagBasedTemplate(ContextualBaseTemplate):
    """
    Abstract class for mixed templates: word, then tag.
    """

    def get_complement(self, token):
        args = self.get_target()
        # get_neighbors() must return an empty [] if a neighbor is missing.
        nbors = token.get_neighbors(*args)
        if nbors:
            return [unicode(nbors[0].original), nbors[1].verified_tag]
        else:
            return []


class TagWordBasedTemplate(ContextualBaseTemplate):
    """
    Abstract class for mixed templates: tag, then word.
    """

    def get_complement(self, token):
        args = self.get_target()
        nbors = token.get_neighbors(*args)
        if nbors:
            return [nbors[0].verified_tag, unicode(nbors[1])]
        else:
            return []


class NEXTBIGRAM(WordBasedTemplate):
    """
    The next two words are X and Y.
    """

    def get_target(self):
        return 1, 2


class PREVBIGRAM(WordBasedTemplate):
    """
    The previous two words are X and Y.
    """

    def get_target(self):
        return -2, -1


class OrTemplate(ContextualBaseTemplate):
    """
    Abstract class for templates where we check a set of positions rather
    than one specific position.
    """

    def test_complement(self, token, complement):
        return complement[0] in self.get_complement(token)

    def make_rules(self, token):
        nb = self.get_complement(token)
        final = []
        if len(nb) > 0:
            for w in nb:
                final += [self.compile_rule(token.tag, token.verified_tag, [w])]
        return final
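
    # Sketch: for a token whose two previous neighbors are verified-tagged
    # DTC:sg and ADJ:sg (hypothetical tags), PREV1OR2TAG.make_rules() would
    # produce one rule per neighbor:
    #   <token.tag> <token.verified_tag> PREV1OR2TAG DTC:sg
    #   <token.tag> <token.verified_tag> PREV1OR2TAG ADJ:sg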


class NEXT1OR2OR3TAG(OrTemplate, TagBasedTemplate):
    """
    One of the next three tokens is tagged X.
    """

    def get_target(self):
        return 1, 2, 3


class NEXT1OR2TAG(OrTemplate, TagBasedTemplate):
    """
    One of the next two tokens is tagged X.
    """

    def get_target(self):
        return 1, 2


class PREV1OR2OR3TAG(OrTemplate, TagBasedTemplate):
    """
    One of the previous three tokens is tagged X.
    """

    def get_target(self):
        return -3, -2, -1


class PREV1OR2TAG(OrTemplate, TagBasedTemplate):
    """
    One of the previous two tokens is tagged X.
    """

    def get_target(self):
        return -2, -1


class NEXTTAG(TagBasedTemplate):
    """
    The next token is tagged X.
    """

    def get_target(self):
        return (1,)


class NEXT2TAG(TagBasedTemplate):
    """
    The token two positions after is tagged X.
    """

    def get_target(self):
        return (2,)


class PREVTAG(TagBasedTemplate):
    """
    The previous token is tagged X.
    """

    def get_target(self):
        return (-1,)


class PREV2TAG(TagBasedTemplate):
    """
    The token two positions before is tagged X.
    """

    def get_target(self):
        return (-2,)


class SURROUNDTAG(TagBasedTemplate):
    """
    The preceding word is tagged X and the following word is tagged Y.
    """

    def get_target(self):
        return -1, 1


class NEXT1OR2WD(OrTemplate, WordBasedTemplate):
    """
    One of the next two tokens is the word X.
    """

    def get_target(self):
        return 1, 2


class NEXT2WD(WordBasedTemplate):
    """
    The token two positions after is the word X.
    """

    def get_target(self):
        return (2,)


class NEXTWD(WordBasedTemplate):
    """
    The next token is the word X.
    """

    def get_target(self):
        return (1,)


class CURWD(WordBasedTemplate):
    """
    The word is X. I have doubts about the interest of this template...
    """

    def get_target(self):
        return (0,)


class PREV1OR2WD(OrTemplate, WordBasedTemplate):
    """
    One of the previous two tokens is the word X.
    """

    def get_target(self):
        return -2, -1


class PREV2WD(WordBasedTemplate):
    """
    The token two positions before is the word X.
    """

    def get_target(self):
        return (-2,)


class PREVWD(WordBasedTemplate):
    """
    The previous token is the word X.
    """

    def get_target(self):
        return (-1,)


class WDAND2BFR(WordBasedTemplate):
    """
    The current word, and the word two positions before.
    """

    def get_target(self):
        return (-2, 0)


class WDAND2AFT(WordBasedTemplate):
    """
    The current word, and the word two positions after.
    """

    def get_target(self):
        return (0, 2)


class LBIGRAM(WordBasedTemplate):
    """
    The previous word and the current word.
    """

    def get_target(self):
        return (-1, 0)


class RBIGRAM(WordBasedTemplate):
    """
    The current word and the next word.
    """

    def get_target(self):
        return (0, 1)


class WDAND2TAGAFT(WordTagBasedTemplate):
    """
    Current word, and tag of the token two positions after.
    """

    def get_target(self):
        return 0, 2


class WDAND2TAGBFR(TagWordBasedTemplate):
    """
    Tag of the token two positions before, and current word.
    """

    def get_target(self):
        return -2, 0


class WDNEXTTAG(WordTagBasedTemplate):
    """
    Current word, and tag of the token after.
    """

    def get_target(self):
        return 0, 1


class WDPREVTAG(TagWordBasedTemplate):
    """
    Tag of the token before, and current word.
    """

    def get_target(self):
        return -1, 0


class LemmatizerTemplateGenerator(type):

    register = dict()
    _loaded_rules = None

    def __new__(mcs, name, base, dict):
        theclass = type.__new__(mcs, name, base, dict)
        if name.isupper():
            LemmatizerTemplateGenerator.register[name] = theclass
        return theclass

    @classmethod
    def get_instance(cls, s, **kwargs):
        """
        `s` can be a template name or a rule.
        """
        if s.count(" ") > 0:  # rule
            name = s.split(" ")[1]
        else:
            name = s
        child_class = cls.register[name]
        return child_class.get_or_create(name, LemmatizerTemplateGenerator)

    @classmethod
    def export(cls, rules):
        """
        `rules` are tuples (rule, score).
        """
        # Alternatively, sort first: sorted(rules, key=itemgetter(1), reverse=True)
        save_to_file("corpus/lemmatizer_rules.pdg",
                     "\n".join("%s\t%f" % (rule, float(score))
                               for rule, score in rules))

    @classmethod
    def load(cls):
        if cls._loaded_rules is None:
            log("Loading lemmatizer rules...", "CYAN", True)
            lx = load_file("corpus/lemmatizer_rules.rls")
            cls._loaded_rules = []
            for line in lx.split(u"\n"):
                els = line.split(u"\t")
                if els[0] != u"":
                    cls._loaded_rules.append(els[0])
        return cls._loaded_rules


class LemmatizerBaseTemplate(RetrievableObject):
    """
    For the lemmatizer training, there is just one template family: it
    creates as many possible rules as there are letters in the tested
    token.

    MAKELOWER
    FORCELEMME
    CHANGESUFFIX
    """
    __metaclass__ = LemmatizerTemplateGenerator

    def __init__(self, pk, **kwargs):
        self.id = pk

    def make_rules(self, token):
        pass  # Must be overridden

    def compile_rule(self):
        pass

    def test_rule(self, token, rule):
        pass

    def is_candidate(self, token, rule):
        from_tag = self.uncompile_rule(rule)[0]
        return token.tag == from_tag

    def uncompile_rule(self, rule):
        return rule.split(" ")

    def __unicode__(self):
        return u"<%s %s>" % (self.__class__.__name__, self.id)


class MAKELOWER(LemmatizerBaseTemplate):
    """
    Make the original lowercase, if the tag is X.
    """

    def make_rules(self, token):
        if token.lemme[0].isupper():
            return [self.compile_rule(token.tag)]
        else:
            return []

    def compile_rule(self, tag):
        return '''%s %s''' % (tag, self.__class__.__name__)

    # def is_candidate(self, token, rule):
    #     tag, _ = self.uncompile_rule(rule)
    #     # the token has the right tag for the rule
    #     return token.tag == tag \
    #         and token[0].isupper()  # the first letter is uppercase

    def test_rule(self, token, rule):
        if not self.is_candidate(token, rule):
            return 0
        elif token.verified_lemme[0] == token.lemme[0].lower():
            return 1
        else:
            return -1

    def apply_rule(self, tokens, rule):
        for token in tokens:
            if self.is_candidate(token, rule):
                token.lemme = token.lemme.lower()
                # Maybe we should do this only in training mode
                token.sample.reset_trainer_status()


class CHANGESUFFIX(LemmatizerBaseTemplate):
    """
    Change the lemme suffix Y to Z, if the tag is X.
    """

    def make_rules(self, token):
        """
        We make one rule for each possible transformation turning
        token.lemme into token.verified_lemme.
        """
        final_rules = set()
        for i in xrange(1, len(token) + 1):
            suffix = token.lemme[-i:]
            stem = token.lemme[:-i]
            if token.verified_lemme[:len(stem)] == stem:  # potential rule
                final_rules.add(self.compile_rule(token.tag, suffix,
                                                  token.verified_lemme[len(stem):]))
        return final_rules
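
    # Worked example (a sketch): for a token tagged SBC:pl with
    # lemme == u"chanteurs" and verified_lemme == u"chanteur", the loop
    # yields, among others:
    #   i = 1: suffix u"s",  stem u"chanteur"  ->  SBC:pl CHANGESUFFIX "s" ""
    #   i = 2: suffix u"rs", stem u"chanteu"   ->  SBC:pl CHANGESUFFIX "rs" "r"
    # These are precisely the two rules compared in RuleTemplate.select_one().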

    def compile_rule(self, tag, to_delete, to_add):
        return '''%s %s "%s" "%s"''' % (tag, self.__class__.__name__,
                                        to_delete, to_add)

    def uncompile_rule(self, rule):
        els = rule.split(" ")
        return els[0], els[2][1:-1], els[3][1:-1]

    def is_candidate(self, token, rule):
        tag, to_delete, to_add = self.uncompile_rule(rule)
        # The token has the right tag for the rule,
        # and its suffix is the one of the rule.
        return token.tag == tag \
            and token.lemme[-len(to_delete):] == to_delete

    def test_rule(self, token, rule):
        tag, to_delete, to_add = self.uncompile_rule(rule)
        if not self.is_candidate(token, rule):
            return 0
        elif token.verified_lemme == token.lemme[:-len(to_delete)] + to_add:
            return 1
        else:
            return -1

    def apply_rule(self, tokens, rule):
        tag, to_delete, to_add = self.uncompile_rule(rule)
        for token in tokens:
            if self.is_candidate(token, rule):
                token.lemme = token.lemme[:-len(to_delete)] + to_add
                # Maybe we should do this only in training mode
                token.sample.reset_trainer_status()


class FORCELEMME(LemmatizerBaseTemplate):
    """
    Give lemme Y, if the tag is X.
    """

    def make_rules(self, token):
        return [self.compile_rule(token.tag, token.verified_lemme)]

    def compile_rule(self, tag, lemme):
        return '''%s %s %s''' % (tag, self.__class__.__name__, lemme)
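
    # Example rule strings for the three lemmatizer templates (a sketch; the
    # tags and the lemme are hypothetical, except the CHANGESUFFIX rule,
    # which is quoted in RuleTemplate.select_one above):
    #   SBC:sg MAKELOWER
    #   SBC:pl CHANGESUFFIX "s" ""
    #   VNCFF FORCELEMME manger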

    def is_candidate(self, token, rule):
        tag, _, lemme = self.uncompile_rule(rule)
        return token.tag == tag  # the token has the right tag for the rule

    def test_rule(self, token, rule):
        tag, _, lemme = self.uncompile_rule(rule)
        if not self.is_candidate(token, rule):
            return 0
        elif token.verified_lemme == lemme:
            return 1
        else:
            return -1

    def apply_rule(self, tokens, rule):
        tag, _, lemme = self.uncompile_rule(rule)
        for token in tokens:
            if self.is_candidate(token, rule):
                token.lemme = lemme
                # Maybe we should do this only in training mode
                token.sample.reset_trainer_status()
