Source code for sulci.lemmatizer

#!/usr/bin/env python
# -*- coding:Utf-8 -*-

from sulci.base import TextManager
from sulci.log import sulci_logger
from sulci.rules_templates import LemmatizerTemplateGenerator

class Lemmatizer(TextManager):
    """
    Give a lemma for a token, using its POS tag.

    The lemmatizer is backed by a training corpus (``*.lem.crp`` files under
    ``PATH``) that is loaded lazily on first access to :attr:`tokens` /
    :attr:`samples`, and by a ``lexicon`` mapping words to per-tag lemmas.
    """

    PATH = "corpus"          # directory holding the training corpus
    VALID_EXT = ".lem.crp"   # extension of valid corpus files

    def __init__(self, lexicon):
        # Lazy caches — populated on first property access.
        self._tokens = None
        # Bug fix: _samples was never initialized, so reading the `samples`
        # property before `tokens` raised AttributeError instead of loading.
        self._samples = None
        self._raw_content = None
        self._len = None
        self.lexicon = lexicon

    def __len__(self):
        # Number of tokens in the training corpus (cached after first call;
        # triggers corpus loading the first time).
        if self._len is None:
            self._len = len(self.tokens)
        return self._len

    @property
    def content(self):
        """Raw corpus text, whitespace-normalized (lazily loaded)."""
        if self._raw_content is None:
            self._raw_content = ""
            # TextManager helper: appends each valid corpus file to
            # self._raw_content.
            self.load_valid_files()
            # Bug fix: the second replace was a no-op (" " -> " "); the
            # intent is to collapse double spaces left by the newline join.
            self._raw_content = self._raw_content.replace("\n", " ").replace("  ", " ")
        return self._raw_content

    @property
    def tokens(self):
        """Token objects of the training corpus (lazily instantiated)."""
        if self._tokens is None:
            sulci_logger.info("Loading Lemmatizer corpus...", "GREEN", True)
            self._samples, self._tokens = self.instantiate_text(self.content.split())
        return self._tokens

    @property
    def samples(self):
        """Sample objects of the training corpus (lazily instantiated)."""
        if self._samples is None:
            self.tokens  # Load tokens and samples as a side effect
        return self._samples

    def do(self, token):
        """
        Lemmatize a Token object or a list of token objects.

        Return the (mutated) token or the list, matching the input shape.
        """
        # Bug fix: the old `hasattr(...) and token or [token]` idiom wrapped
        # falsy iterables (e.g. an empty list) a second time, yielding [[]].
        is_iterable = hasattr(token, "__iter__")
        tks = token if is_iterable else [token]
        rules = LemmatizerTemplateGenerator.load()  # Cache me
        for rule in rules:
            template, _ = LemmatizerTemplateGenerator.get_instance(rule)
            template.apply_rule(tks, rule)
        # Force the lemma when the word is in the lexicon with the current
        # POS tag — lexicon data overrides rule output.
        for tk in tks:
            if tk in self.lexicon and tk.tag in self.lexicon[tk]:
                tk.lemme = self.lexicon[tk][tk.tag]
        return tks if is_iterable else tks[0]

Project Versions