#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
from collections import defaultdict
from operator import itemgetter
from GenericCache.GenericCache import GenericCache
from GenericCache.decorators import cached
from utils import load_file, save_to_file, log
from base import RetrievableObject
# Cache
cache = GenericCache()

class RuleTemplate(RetrievableObject):
    """
    Class for managing rule creation and analysis.
    """

    def __init__(self, pk, **kwargs):
        self.id = pk

    def test_rule(self, token, rule):
        to_tag = self.get_to_tag(rule)
        if not self.is_candidate(token, rule):
            return 0
        elif token.has_verified_tag(to_tag) and not token.is_tagged(to_tag):
            return 1
        else:
            return -1
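
    # Return value semantics (inferred from the code above): 0 means the
    # rule does not apply to this token; 1 means applying it would fix the
    # tag (the verified tag is to_tag and the token is not yet tagged so);
    # -1 means applying it would introduce a new error.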

    def get_to_tag(self, rule):
        from_tag, to_tag, _, complement = self.uncompile_rule(rule)
        return to_tag

    def apply_rule(self, tokens, rule):
        """
        Apply a rule to the candidates in a set of tokens.
        """
        to_tag = self.get_to_tag(rule)
        for token in tokens:
            if self.is_candidate(token, rule):
                token.tag = to_tag
                # Maybe we should do this only in training mode
                token.sample.reset_trainer_status()

    @classmethod
    def select_one(cls, rules, MAX, minval=2):
        """
        Select one rule from a set of tested rules.
        `rules` is an iterable of tuples (rule, good, bad), where good is the
        number of errors corrected, and bad the number of errors generated.
        """
        # Questions are:
        # - which rule to select?
        # - what to favor?
        #   - the difference between good and bad?
        #   - the ratio between good and bad?
        #   - a mix of both?
        #   - with which coefficient?
        # Reminder (from kilobug):
        # Basically we have two options:
        # - either good / (good + bad) * A + good / MAX * B
        # - or (good / (good + bad)) ** A * (good / MAX) ** B
        # (arithmetic or geometric mean)
        # But to keep it simple we should manage with a single coefficient:
        # - good / (good + bad) + good / MAX * B
        # - (good / (good + bad)) * (good / MAX) ** B
        # For example:
        # - SBC:pl CHANGESUFFIX "rs" "r"  g: 15  b: 0
        # - SBC:pl CHANGESUFFIX "s" ""    g: 179 b: 18
        # => which one should be chosen?
        # The first means: stick to human logic (try to create as few errors
        # as possible when applying rules).
        # The second: use your own logic (create errors if the operation
        # seems globally good, and create new rules to correct the errors
        # created).
        # Sort using the good / bad ratio, giving advantage to the more
        # numerous when the ratios are close.
        coeff = 0.1  # The lower the coeff, the stronger the rules with bad == 0
        g = lambda good, bad: (float(good) / (float(good) + float(bad))) \
            * ((float(good) / MAX) ** coeff)
        try:
            return sorted([(r[0], g(r[1], r[2])) for r in rules
                           if r[1] / max(r[2], 1) >= minval],
                          key=itemgetter(1), reverse=True)[0]
        except IndexError:
            return None, None
        # Sort using the difference between good and bad,
        # giving advantage to the less bad if the difference is the same.
        # Adding 0.1 to prevent division by zero.
        # return sorted([(r[0], (r[1] - r[2] - (r[2] / (r[1] + 0.1)))) for r in rules], key=itemgetter(1), reverse=True)[0]
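
    # A worked example of the geometric scoring above (hypothetical MAX=200):
    # g(15, 0)   = 15/15   * (15/200) ** 0.1  ~= 1.000 * 0.772 ~= 0.77
    # g(179, 18) = 179/197 * (179/200) ** 0.1 ~= 0.909 * 0.989 ~= 0.90
    # So the more productive rule wins despite its 18 errors; with a coeff
    # closer to 0 the (good / MAX) factor tends to 1 and the error-free rule
    # would win instead.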

    def make_rules(self, token):
        comp = self.get_complement(token)
        if len(comp) > 0:
            return [self.compile_rule(token.tag, token.verified_tag, comp)]
        return []

    def test_complement(self, token, complement):  # cache this
        return complement == self.get_complement(token)

    def is_candidate(self, token, rule):
        # Should we check that the word is in the lexicon and that to_tag is
        # a possible tag for it?
        from_tag, to_tag, _, complement = self.uncompile_rule(rule)
        # We don't always have a from_tag (e.g. template hassuf)
        if (from_tag and token.tag != from_tag) or \
           not self.test_complement(token, complement):
            return False
        return True


class ContextualTemplateGenerator(type):

    register = dict()
    _loaded_rules = None

    def __new__(mcs, name, base, attrs):
        theclass = type.__new__(mcs, name, base, attrs)
        if name.isupper():
            ContextualTemplateGenerator.register[name] = theclass
        return theclass

    @classmethod
    def get_instance(cls, s, **kwargs):
        """
        Return an instance of a rule template, from a template name or a
        rule string. `s` can be a template name or a rule.
        """
        if s.count(" ") > 0:  # rule
            _, _, name, _ = ContextualBaseTemplate.uncompile_rule(s)
        else:
            name = s
        child_class = cls.register[name]
        return child_class.get_or_create(name, ContextualTemplateGenerator)
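
    # A hedged usage sketch (assuming RetrievableObject.get_or_create
    # returns the template instance):
    # >>> ContextualTemplateGenerator.get_instance(u"DTN ADJ PREVTAG DTN")
    # returns the registered PREVTAG instance, whether `s` is a full rule
    # string or just the bare template name u"PREVTAG".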

    @classmethod
    def export(cls, rules):
        """
        Export rules to the provisional config file.
        `rules` are tuples (rule, score).
        """
        save_to_file("corpus/contextual_rules.pdg",
                     "\n".join(rule for rule, score in rules))

    @classmethod
    def load(cls):
        """
        Load rules from the config file.
        """
        if cls._loaded_rules is None:
            log("Loading contextual rules...", "CYAN", True)
            lx = load_file("corpus/contextual_rules.rls")
            cls._loaded_rules = [r for r in lx.split(u"\n") if len(r) > 1]
        return cls._loaded_rules


class ContextualBaseTemplate(RuleTemplate):
    """
    Base class for the contextual rules.
    """
    __metaclass__ = ContextualTemplateGenerator

    _uncompiled_rules = {}

    def compile_rule(self, from_tag, to_tag, complement):
        """
        Make the final rule string.
        `complement` must be an iterable.
        """
        comp = " ".join(unicode(c) for c in complement)
        return u"%s %s %s %s" % (from_tag, to_tag, self.__class__.__name__, comp)

    @classmethod
    def uncompile_rule(cls, rule):
        try:
            return cls._uncompiled_rules[rule]
        except KeyError:
            els = rule.split()
            cls._uncompiled_rules[rule] = els[0], els[1], els[2], els[3:]
            return cls._uncompiled_rules[rule]
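
    # A hedged round-trip example of the contextual rule format (on a
    # hypothetical PREVTAG instance):
    # compile_rule(u"ADJ", u"SBC:sg", [u"DTN"])
    #     => u"ADJ SBC:sg PREVTAG DTN"
    # uncompile_rule(u"ADJ SBC:sg PREVTAG DTN")
    #     => (u"ADJ", u"SBC:sg", u"PREVTAG", [u"DTN"])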


class LexicalTemplateGenerator(type):

    register = dict()
    _loaded_rules = None  # caching

    def __new__(mcs, name, base, attrs):
        theclass = type.__new__(mcs, name, base, attrs)
        if name.islower():
            LexicalTemplateGenerator.register[name] = theclass
        return theclass

    @classmethod
    def get_instance(cls, s, lexicon):
        """
        `s` can be a template name or a rule.
        """
        if s.count(" ") > 0:  # rule
            _, _, name, _ = LexicalBaseTemplate.uncompile_rule(s)
        else:
            name = s
        child_class = cls.register[name]
        return child_class.get_or_create(name, LexicalTemplateGenerator, lexicon=lexicon)

    @classmethod
    def export(cls, rules):
        """
        `rules` are tuples (rule, score).
        """
        save_to_file("corpus/lexical_rules.pdg",
                     "\n".join("%s\t%f" % (rule, float(score))
                               for rule, score
                               in sorted(rules, key=itemgetter(1), reverse=True)))

    @classmethod
    def load(cls):
        if cls._loaded_rules is None:
            log("Loading lexical rules...", "CYAN", True)
            cls._loaded_rules = []
            lx = load_file("corpus/lexical_rules.rls")
            for line in lx.split(u"\n"):
                els = line.split(u"\t")
                if els[0] != u"":
                    cls._loaded_rules.append(els[0])
        return cls._loaded_rules


class LexicalBaseTemplate(RuleTemplate):
    """
    Base class for the lexical rules.
    """
    __metaclass__ = LexicalTemplateGenerator

    _uncompiled_rules = {}

    def __init__(self, pk, **kwargs):
        self.id = pk
        self.lexicon = kwargs["lexicon"]
        self.check_from_tag = self.__class__.__name__.startswith(u"f")

    def compile_rule(self, from_tag, to_tag, complement):
        if self.check_from_tag:
            return u"%s %s %s %d %s" % (from_tag, unicode(complement),
                                        self.__class__.__name__,
                                        len(complement), to_tag)
        else:
            return u"%s %s %d %s" % (unicode(complement),
                                     self.__class__.__name__,
                                     len(complement), to_tag)

    @classmethod
    def _uncompile_rule(cls, rule):
        els = rule.split()
        for el in els:
            if el in LexicalTemplateGenerator.register:  # el is the class name
                if u"good" in el:  # proximity template (goodleft / goodright)
                    if el.startswith("f"):
                        from_tag, complement, classname, to_tag = els
                    else:
                        complement, classname, to_tag = els
                        from_tag = None
                else:  # affix template, with a length field
                    if el.startswith("f"):
                        from_tag, complement, classname, _, to_tag = els
                    else:
                        complement, classname, _, to_tag = els
                        from_tag = None
                break
        cls._uncompiled_rules[rule] = from_tag, to_tag, classname, complement

    @classmethod
    def uncompile_rule(cls, rule):  # cached via _uncompiled_rules
        if rule not in cls._uncompiled_rules:
            cls._uncompile_rule(rule)
        return cls._uncompiled_rules[rule]

    def test_complement(self, token, complement):
        return complement in self.get_complement(token)
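
    # Hedged examples of the two lexical rule layouts handled above:
    #   u"ment hassuf 4 ADV"          -> (None, u"ADV", u"hassuf", u"ment")
    #   u"SBC:sg ment fhassuf 4 ADV"  -> (u"SBC:sg", u"ADV", u"fhassuf", u"ment")
    #   u"de goodright SBC:sg"        -> (None, u"SBC:sg", u"goodright", u"de")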


class ProximityCheckTemplate(LexicalBaseTemplate):

    def compile_rule(self, from_tag, to_tag, complement):
        """
        Same as the base template, but without the length field.
        """
        if self.check_from_tag:
            return u"%s %s %s %s" % (from_tag, unicode(complement),
                                     self.__class__.__name__, to_tag)
        else:
            return u"%s %s %s" % (unicode(complement),
                                  self.__class__.__name__, to_tag)


class NoLexiconCheckTemplate(LexicalBaseTemplate):

    def make_rules(self, token):
        final = []
        for affix in self.get_complement(token):
            to_tag = token.verified_tag
            from_tag = token.tag
            rule = self.compile_rule(from_tag, to_tag, affix)
            final.append(rule)
        return final


class LexiconCheckTemplate(LexicalBaseTemplate):
    """
    Base template for those which have to check the lexicon.
    """

    def make_rules(self, token):
        final = []
        for affix, ceased_tk in self.get_complement(token):
            if ceased_tk in self.lexicon:
                to_tag = token.verified_tag  # or the lexicon token tag?
                if self.check_from_tag:
                    rule = u"%s %s %s %d %s" % (token.tag, unicode(affix),
                                                self.__class__.__name__,
                                                len(affix), to_tag)
                else:
                    rule = u"%s %s %d %s" % (unicode(affix),
                                             self.__class__.__name__,
                                             len(affix), to_tag)
                final.append(rule)
        return final

    def test_complement(self, token, complement):
        """
        For the lexicon check rules, we need to check that the modified word
        is in the lexicon.
        """
        return self.modified_token(token, complement) in self.lexicon
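
    # A hedged illustration of the contract above, using the deletesuf
    # subclass defined below with a hypothetical token u"chanteurs":
    # get_complement would yield pairs like (u"s", u"chanteur"), and a rule
    # is only made when the ceased form (u"chanteur") is in the lexicon.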


class deletesuf(LexiconCheckTemplate):
    """
    Change the current tag to tag X, if removing suffix Y leads to an entry
    of the lexicon.
    """

    def get_complement(self, token):
        """
        Return a list of (affix, ceased_token) tuples.
        """
        final = []
        tlen = len(token.original)
        for i in xrange(1, min(5 + 1, tlen)):
            affix = token.original[tlen - i:tlen]
            ceased_tk = token.original[:tlen - i]  # the word without its suffix
            final.append((affix, ceased_tk))
        return final

    def test_complement(self, token, complement):
        """
        Test that the token has the right suffix, and that deleting it
        results in a word of the lexicon.
        """
        return token[-len(complement):] == complement and \
               token[:-len(complement)] in self.lexicon


class fdeletesuf(deletesuf):
    """
    Change the current tag to tag X, if removing suffix Y leads to an entry
    of the lexicon and if the current tag is Z.
    """
    pass


class deletepref(LexiconCheckTemplate):
    """
    Change the current tag to tag X, if removing prefix Y leads to an entry
    of the lexicon.
    Prefix Y length from 1 to 4 (len(Y) <= 4):
    Syntax : Y deletepref len(Y) X
    Ex. : re deletepref 2 VNCFF
    """

    def get_complement(self, token):
        final = []
        tlen = len(token.original)
        for i in xrange(1, min(5, tlen)):
            ceased_tk = token.original[i:]
            affix = token.original[0:i]
            final.append((affix, ceased_tk))
        return final

    def test_complement(self, token, complement):
        """
        Test that the token has the right prefix, and that deleting it
        results in a word of the lexicon.
        """
        return token[:len(complement)] == complement and \
               token[len(complement):] in self.lexicon


class fdeletepref(deletepref):
    """
    Change the current tag to tag X, if removing prefix Y leads to an entry
    of the lexicon and if the current tag is Z.
    Prefix Y length from 1 to 4 (len(Y) <= 4):
    Syntax : Z Y fdeletepref len(Y) X
    Ex. : ADV re fdeletepref 2 VNCFF
    """
    pass


class addpref(LexiconCheckTemplate):
    """
    Change the current tag to tag X, if adding prefix Y leads to an entry
    of the lexicon.
    Prefix Y length from 1 to 4 (len(Y) <= 4):
    Syntax : Y addpref len(Y) X
    Ex. : er addpref 2 VNCFF
    """

    def get_complement(self, token):
        final = []
        if token.original in self.lexicon.factors:
            for affix in self.lexicon.prefixes:
                increased_tk = self.modified_token(token, affix)
                final.append((affix, increased_tk))
        return final

    def modified_token(self, token, complement):
        return complement + token.original


class faddpref(addpref):
    """
    Change the current tag to tag X, if adding prefix Y leads to an entry
    of the lexicon and if the current tag is Z.
    Prefix Y length from 1 to 4 (len(Y) <= 4):
    Syntax : Z Y faddpref len(Y) X
    Ex. : SBC:sg re faddpref 2 VNCFF
    """
    pass


class addsuf(LexiconCheckTemplate):
    """
    Change the current tag to tag X, if adding suffix Y leads to an entry
    of the lexicon.
    Suffix Y length from 1 to 4 (len(Y) <= 4):
    Syntax : Y addsuf len(Y) X
    Ex. : re addsuf 2 VNCFF
    """

    def get_complement(self, token):
        final = []
        if token.original in self.lexicon.factors:
            for affix in self.lexicon.suffixes:
                increased_tk = self.modified_token(token, affix)
                final.append((affix, increased_tk))
        return final

    def modified_token(self, token, complement):
        return token.original + complement
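
    # A hedged note: lexicon.factors is assumed here to be the set of
    # substrings (factors, in the word-combinatorics sense) of lexicon
    # entries, so the membership test is a cheap pre-filter: a token is only
    # extended with known suffixes if some lexicon entry actually contains it.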


class faddsuf(addsuf):
    """
    Change the current tag to tag X, if adding suffix Y leads to an entry
    of the lexicon and if the current tag is Z.
    Suffix Y length from 1 to 4 (len(Y) <= 4):
    Syntax : Z Y faddsuf len(Y) X
    Ex. : SBC:sg re faddsuf 2 VNCFF
    """
    pass


class hassuf(NoLexiconCheckTemplate):
    """
    Change the current tag to tag X, if the suffix is Y.
    Suffix Y length from 1 to 4 (len(Y) <= 4):
    Syntax : Y hassuf len(Y) X
    Ex. : ment hassuf 4 ADV
    """

    def get_complement(self, token):
        """
        Return the set of known suffixes of the token.
        """
        final = set()
        tlen = len(token.original)
        for i in xrange(1, min(5 + 1, tlen)):
            affix = token.original[tlen - i:tlen]
            if affix in self.lexicon.suffixes:
                final.add(affix)
        return final


class fhassuf(hassuf):
    """
    Change the current tag to tag X, if the suffix is Y and the current tag is Z.
    Suffix Y length from 1 to 4 (len(Y) <= 4):
    Syntax : Z Y fhassuf len(Y) X
    Ex. : SBC:sg ment fhassuf 4 ADV
    """
    pass


class haspref(NoLexiconCheckTemplate):
    """
    Change the current tag to tag X, if the prefix is Y.
    Prefix Y length from 1 to 4 (len(Y) <= 4):
    Syntax : Y haspref len(Y) X
    Ex. : pro haspref 3 SBC:sg
    """

    def get_complement(self, token):
        final = []
        tlen = len(token.original)
        for i in xrange(1, min(5, tlen)):
            affix = token.original[0:i]
            if affix in self.lexicon.prefixes:
                final.append(affix)
        return final


class fhaspref(haspref):
    """
    Change the current tag to tag X, if the prefix is Y and the current tag is Z.
    Prefix Y length from 1 to 4 (len(Y) <= 4):
    Syntax : Z Y fhaspref len(Y) X
    Ex. : ADV bla fhaspref 3 DTC:sg
    """
    pass


class goodright(NoLexiconCheckTemplate, ProximityCheckTemplate):
    """
    The current word is at the right of the word X.
    """

    def get_complement(self, token):
        return [unicode(t.original) for t in token.get_neighbors(-1)]


class fgoodright(goodright):
    pass


class goodleft(NoLexiconCheckTemplate, ProximityCheckTemplate):
    """
    The current word is at the left of the word X.
    """

    def get_complement(self, token):
        return [unicode(t.original) for t in token.get_neighbors(1)]


class fgoodleft(goodleft):
    pass


class WordBasedTemplate(ContextualBaseTemplate):
    """
    Abstract class for word-based templates.
    """

    def get_complement(self, token):
        args = self.get_target()
        return [unicode(e.original) for e in token.get_neighbors(*args)]


class TagBasedTemplate(ContextualBaseTemplate):
    """
    Abstract class for tag-based templates.
    """

    def get_complement(self, token):
        args = self.get_target()
        return [e.verified_tag for e in token.get_neighbors(*args)]


class WordTagBasedTemplate(ContextualBaseTemplate):
    """
    Abstract class for mixed templates: word, then tag.
    """

    def get_complement(self, token):
        args = self.get_target()
        nbors = token.get_neighbors(*args)  # must return [] when incomplete
        if nbors:
            return [unicode(nbors[0].original), nbors[1].verified_tag]
        else:
            return []


class TagWordBasedTemplate(ContextualBaseTemplate):
    """
    Abstract class for mixed templates: tag, then word.
    """

    def get_complement(self, token):
        args = self.get_target()
        nbors = token.get_neighbors(*args)
        if nbors:
            return [nbors[0].verified_tag, unicode(nbors[1].original)]
        else:
            return []


class NEXTBIGRAM(WordBasedTemplate):
    """
    The next two words are X and Y.
    """

    def get_target(self):
        return 1, 2


class PREVBIGRAM(WordBasedTemplate):
    """
    The previous two words are X and Y.
    """

    def get_target(self):
        return -2, -1


class OrTemplate(ContextualBaseTemplate):
    """
    Abstract class for templates that check no specific position.
    """

    def test_complement(self, token, complement):
        return complement[0] in self.get_complement(token)

    def make_rules(self, token):
        nb = self.get_complement(token)
        final = []
        if len(nb) > 0:
            for w in nb:
                final += [self.compile_rule(token.tag, token.verified_tag, [w])]
        return final


class NEXT1OR2OR3TAG(OrTemplate, TagBasedTemplate):
    """
    One of the next three tokens is tagged X.
    """

    def get_target(self):
        return 1, 2, 3


class NEXT1OR2TAG(OrTemplate, TagBasedTemplate):
    """
    One of the next two tokens is tagged X.
    """

    def get_target(self):
        return 1, 2


class PREV1OR2OR3TAG(OrTemplate, TagBasedTemplate):
    """
    One of the previous three tokens is tagged X.
    """

    def get_target(self):
        return -3, -2, -1


class PREV1OR2TAG(OrTemplate, TagBasedTemplate):
    """
    One of the previous two tokens is tagged X.
    """

    def get_target(self):
        return -2, -1


class NEXTTAG(TagBasedTemplate):
    """
    The next token is tagged X.
    """

    def get_target(self):
        return (1,)


class NEXT2TAG(TagBasedTemplate):
    """
    The token two after is tagged X.
    """

    def get_target(self):
        return (2,)


class PREVTAG(TagBasedTemplate):
    """
    The previous token is tagged X.
    """

    def get_target(self):
        return (-1,)


class PREV2TAG(TagBasedTemplate):
    """
    The token two before is tagged X.
    """

    def get_target(self):
        return (-2,)


class SURROUNDTAG(TagBasedTemplate):
    """
    The preceding word is tagged X and the following word is tagged Y.
    """

    def get_target(self):
        return -1, 1


class NEXT1OR2WD(OrTemplate, WordBasedTemplate):
    """
    One of the next two tokens is the word X.
    """

    def get_target(self):
        return 1, 2


class NEXT2WD(WordBasedTemplate):
    """
    The token two after is the word X.
    """

    def get_target(self):
        return (2,)


class NEXTWD(WordBasedTemplate):
    """
    The next token is the word X.
    """

    def get_target(self):
        return (1,)


class CURWD(WordBasedTemplate):
    """
    The current word is X.
    I have doubts about the usefulness of this template...
    """

    def get_target(self):
        return (0,)


class PREV1OR2WD(OrTemplate, WordBasedTemplate):
    """
    One of the previous two tokens is the word X.
    """

    def get_target(self):
        return -2, -1


class PREV2WD(WordBasedTemplate):
    """
    The token two before is the word X.
    """

    def get_target(self):
        return (-2,)


class PREVWD(WordBasedTemplate):
    """
    The previous token is the word X.
    """

    def get_target(self):
        return (-1,)


class WDAND2BFR(WordBasedTemplate):
    """
    The word two before is X and the current word is Y.
    """

    def get_target(self):
        return (-2, 0)


class WDAND2AFT(WordBasedTemplate):
    """
    The current word is X and the word two after is Y.
    """

    def get_target(self):
        return (0, 2)


class LBIGRAM(WordBasedTemplate):
    """
    The word before is X and the current word is Y.
    """

    def get_target(self):
        return (-1, 0)


class RBIGRAM(WordBasedTemplate):
    """
    The current word is X and the word after is Y.
    """

    def get_target(self):
        return (0, 1)


class WDAND2TAGAFT(WordTagBasedTemplate):
    """
    The current word is X and the token two after is tagged Y.
    """

    def get_target(self):
        return 0, 2


class WDAND2TAGBFR(TagWordBasedTemplate):
    """
    The token two before is tagged X and the current word is Y.
    """

    def get_target(self):
        return -2, 0


class WDNEXTTAG(WordTagBasedTemplate):
    """
    The current word is X and the next token is tagged Y.
    """

    def get_target(self):
        return 0, 1


class WDPREVTAG(TagWordBasedTemplate):
    """
    The previous token is tagged X and the current word is Y.
    """

    def get_target(self):
        return -1, 0


class LemmatizerTemplateGenerator(type):

    register = dict()
    _loaded_rules = None

    def __new__(mcs, name, base, attrs):
        theclass = type.__new__(mcs, name, base, attrs)
        if name.isupper():
            LemmatizerTemplateGenerator.register[name] = theclass
        return theclass

    @classmethod
    def get_instance(cls, s, **kwargs):
        """
        `s` can be a template name or a rule.
        """
        if s.count(" ") > 0:  # rule
            name = s.split(" ")[1]
        else:
            name = s
        child_class = cls.register[name]
        return child_class.get_or_create(name, LemmatizerTemplateGenerator)

    @classmethod
    def export(cls, rules):
        """
        `rules` are tuples (rule, score).
        """
        save_to_file("corpus/lemmatizer_rules.pdg",
                     "\n".join("%s\t%f" % (rule, float(score))
                               for rule, score
                               in rules))
        # in sorted(rules, key=itemgetter(1), reverse=True)))

    @classmethod
    def load(cls):
        if cls._loaded_rules is None:
            log("Loading lemmatizer rules...", "CYAN", True)
            lx = load_file("corpus/lemmatizer_rules.rls")
            cls._loaded_rules = []
            for line in lx.split(u"\n"):
                els = line.split(u"\t")
                if els[0] != u"":
                    cls._loaded_rules.append(els[0])
        return cls._loaded_rules


class LemmatizerBaseTemplate(RetrievableObject):
    """
    For the lemmatizer training, this is the one base template: it creates
    as many candidate rules as there are letters in the tested token.
    Subclasses: MAKELOWER, FORCELEMME, CHANGESUFFIX.
    """
    __metaclass__ = LemmatizerTemplateGenerator

    def __init__(self, pk, **kwargs):
        self.id = pk

    def make_rules(self, token):
        pass  # Must be overridden

    def compile_rule(self):
        pass

    def test_rule(self, token, rule):
        pass

    def is_candidate(self, token, rule):
        from_tag = self.uncompile_rule(rule)[0]
        return token.tag == from_tag

    def uncompile_rule(self, rule):
        return rule.split(" ")

    def __unicode__(self):
        return u"<%s %s>" % (self.__class__.__name__, self.id)


class MAKELOWER(LemmatizerBaseTemplate):
    """
    Make the lemme lowercase, if the tag is X.
    """

    def make_rules(self, token):
        if token.lemme[0].isupper():
            return [self.compile_rule(token.tag)]
        else:
            return []

    def compile_rule(self, tag):
        return '''%s %s''' % (tag, self.__class__.__name__)

    # def is_candidate(self, token, rule):
    #     tag, _ = self.uncompile_rule(rule)
    #     # the token has the right tag for the rule
    #     return token.tag == tag \
    #            and token[0].isupper()  # the first letter is uppercase

    def test_rule(self, token, rule):
        if not self.is_candidate(token, rule):
            return 0
        elif token.verified_lemme[0] == token.lemme[0].lower():
            return 1
        else:
            return -1

    def apply_rule(self, tokens, rule):
        for token in tokens:
            if self.is_candidate(token, rule):
                token.lemme = token.lemme.lower()
                # Maybe we should do this only in training mode
                token.sample.reset_trainer_status()


class CHANGESUFFIX(LemmatizerBaseTemplate):
    """
    Change the suffix of the lemme, if the tag is X.
    """

    def make_rules(self, token):
        """
        We make one rule for each possible transformation producing
        verified_lemme from the current lemme.
        """
        final_rules = set()
        for i in xrange(1, len(token) + 1):
            suffix = token.lemme[-i:]
            stem = token.lemme[:-i]
            if token.verified_lemme[:len(stem)] == stem:  # potential rule
                final_rules.add(self.compile_rule(token.tag, suffix,
                                                  token.verified_lemme[len(stem):]))
        return final_rules

    def compile_rule(self, tag, to_delete, to_add):
        return '''%s %s "%s" "%s"''' % (tag, self.__class__.__name__,
                                        to_delete, to_add)

    def uncompile_rule(self, rule):
        els = rule.split(" ")
        return els[0], els[2][1:-1], els[3][1:-1]

    def is_candidate(self, token, rule):
        tag, to_delete, to_add = self.uncompile_rule(rule)
        # the token has the right tag for the rule
        return token.tag == tag \
               and token.lemme[-len(to_delete):] == to_delete  # and the rule's suffix

    def test_rule(self, token, rule):
        tag, to_delete, to_add = self.uncompile_rule(rule)
        if not self.is_candidate(token, rule):
            return 0
        elif token.verified_lemme == token.lemme[:-len(to_delete)] + to_add:
            return 1
        else:
            return -1

    def apply_rule(self, tokens, rule):
        tag, to_delete, to_add = self.uncompile_rule(rule)
        for token in tokens:
            if self.is_candidate(token, rule):
                token.lemme = token.lemme[:-len(to_delete)] + to_add
                # Maybe we should do this only in training mode
                token.sample.reset_trainer_status()
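
# A worked sketch of CHANGESUFFIX rule creation (hypothetical token with
# lemme u"chanteurs", verified_lemme u"chanteur", tag u"SBC:pl"):
# i=1 gives stem u"chanteur", suffix u"s"  -> u'SBC:pl CHANGESUFFIX "s" ""'
# i=2 gives stem u"chanteu",  suffix u"rs" -> u'SBC:pl CHANGESUFFIX "rs" "r"'
# Larger i values keep matching as long as the stem is still a prefix of the
# verified lemme, so several competing rules are produced and then scored.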


class FORCELEMME(LemmatizerBaseTemplate):
    """
    Give lemme Y, if the tag is X.
    """

    def make_rules(self, token):
        return [self.compile_rule(token.tag, token.verified_lemme)]

    def compile_rule(self, tag, lemme):
        return '''%s %s %s''' % (tag, self.__class__.__name__, lemme)

    def is_candidate(self, token, rule):
        tag, _, lemme = self.uncompile_rule(rule)
        return token.tag == tag  # the token has the right tag for the rule

    def test_rule(self, token, rule):
        tag, _, lemme = self.uncompile_rule(rule)
        if not self.is_candidate(token, rule):
            return 0
        elif token.verified_lemme == lemme:
            return 1
        else:
            return -1

    def apply_rule(self, tokens, rule):
        tag, _, lemme = self.uncompile_rule(rule)
        for token in tokens:
            if self.is_candidate(token, rule):
                token.lemme = lemme
                # Maybe we should do this only in training mode
                token.sample.reset_trainer_status()