Source code for sulci.corpus

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os

from collections import defaultdict
from operator import itemgetter

from django.utils.text import unescape_entities

from sulci.textutils import normalize_text
from sulci.utils import load_file, save_to_file, get_dir
from sulci.base import TextManager
from sulci.lemmatizer import Lemmatizer
from sulci.log import sulci_logger

class CorpusMonitor(object):
    """
    Convenience class to store common methods between Corpus and TextCorpus.
    """

    def check_usage(self, word=None, tag=None, lemme=None,
                    case_insensitive=False):
        """
        Find occurrences of a word, a tag and/or a lemme in the loaded corpus.
        """
        if not any((word, tag, lemme)):
            raise ValueError("You must specify at least a word, a tag or a lemme")
        found = False
        for t in self:
            # If a specific word is asked
            if word:
                original = t.original
                if case_insensitive:
                    word = word.lower()
                    original = original.lower()
                if word != original:
                    continue
            # If a specific tag is asked
            if tag and tag != t.verified_tag:
                continue
            # Don't care about texts without lemmes, when a lemme is asked
            if lemme:
                if not t.sample.parent.has_verified_lemmes:
                    continue
                if lemme != t.verified_lemme:
                    continue
            sulci_logger.info(u"%s :" % unicode(t.sample.parent), "YELLOW")
            sulci_logger.info(t.show_context(), "WHITE")
            found = True
        if not found:
            not_found = u"No occurrence found for"
            if word:
                not_found += u" %s" % word
            if tag:
                not_found += u" %s" % tag
            if lemme:  # Also report the lemme (was missing from the message)
                not_found += u" %s" % lemme
            sulci_logger.info(not_found, "RED")

    def tags_stats(self, word=None, case_insensitive=False):
        """
        Display tags usage stats.
        """
        d = defaultdict(int)
        for t in self:
            if word:
                original = t.original
                if case_insensitive:
                    word = word.lower()
                    original = original.lower()
                if word != original:
                    continue
            if t.verified_tag is None:
                sulci_logger.info(u"No verified tag for %s" % unicode(t), "RED", True)
            d[t.verified_tag] += 1
        log = u"Tag usage :"
        if word:
            log = u"Tag usage for word '%s'" % word
        sulci_logger.info(log, "WHITE")
        for k, v in sorted(d.iteritems(), key=itemgetter(1), reverse=True):
            sulci_logger.info(u"%s => %d" % (k, v), "CYAN")

    def check(self, lexicon, check_lemmes=False):
        """
        Check the text of the corpus, and try to determine if there are
        some errors, by comparing with the lexicon.
        """
        sulci_logger.info(u"Checking text %s" % self.path, "YELLOW")
        found = False
        for t in self:
            if t in lexicon:
                # Check that the verified tag is in the lexicon.
                # If not, it *could* be an error, so display it.
                if t.verified_tag not in lexicon[t]:
                    sulci_logger.info(u"Word in lexicon, but not this tag for %s (%s)"
                                      % (unicode(t), t.verified_tag), "RED")
                    sulci_logger.info(u"In Lexicon : %s" % lexicon[t])
                    sulci_logger.info(u"Context : %s" % t.show_context(), "MAGENTA")
                    found = True
                if check_lemmes:
                    if t.verified_tag in lexicon[t] \
                            and t.verified_lemme != lexicon[t][t.verified_tag]:
                        sulci_logger.info(u"Word in lexicon, but not this lemme for %s (%s)"
                                          % (unicode(t), t.verified_lemme), "BLUE")
                        sulci_logger.info(u"In Lexicon : %s" % lexicon[t][t.verified_tag], "GRAY")
                        sulci_logger.info(u"Context : %s" % t.show_context(), "YELLOW")
                        found = True
        if not found:
            sulci_logger.info(u"No error found", "YELLOW")
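
# Illustrative only: a minimal sketch of how these shared helpers are meant
# to be used through a Corpus (or TextCorpus) instance, since CorpusMonitor
# itself only provides the methods. The word value is hypothetical.
#
#     corpus = Corpus(extension=".crp")
#     corpus.check_usage(word=u"chat", case_insensitive=True)
#     corpus.tags_stats(word=u"chat")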

class Corpus(CorpusMonitor):
    """
    The corpus is a collection of manually categorised texts.

    We have different kinds of categorised texts :
    - .crp => just POS tagged
    - .lem... => also manually lemmatized
    - .lcx... => will be used to make the Lexicon

    When loading a Corpus, you'll need to specify the kind of texts to load.
    """
    PATH = "corpus"
    VALID_EXT = ".crp"
    PENDING_EXT = ".pdg"
    NEW_EXT = ".new"
    LEXICON_EXT = ".lxc"

    def __init__(self, extension=VALID_EXT, tagger=None):
        """
        You can force a tagger.
        The extension is used to select which category of manually tagged
        files to load.
        """
        self.tagger = tagger
        self._raw_content = ""
        self.extension = extension
        self._tokens = None
        self._samples = None
        self._texts = None

    @property
    def files(self):
        """
        Return the list of corpus files matching the current extension.
        """
        return [x for x in os.listdir(get_dir(__file__) + self.PATH)
                if x.endswith(self.extension)]

    @property
    def texts(self):
        if self._texts is None:
            self._texts = []
            for f in self.files:
                t = TextCorpus(os.path.join(self.PATH, f))
                self._texts.append(t)
        return self._texts

    @property
    def tokens(self):
        if self._tokens is None:
            self._tokens = []
            for corpus_text in self.texts:
                self._tokens += corpus_text.tokens
        return self._tokens

    @property
    def samples(self):
        if self._samples is None:
            self._samples = []
            for corpus_text in self.texts:
                self._samples += corpus_text.samples
        return self._samples

    def __iter__(self):
        return iter(self.tokens)

    def __len__(self):
        return len(self.tokens)
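
# Illustrative only: a Corpus lazily aggregates the texts, tokens and
# samples of every file with the given extension under its PATH directory,
# and iterating a Corpus iterates over its tokens.
#
#     corpus = Corpus(extension=Corpus.VALID_EXT)
#     print len(corpus)      # total token count across all texts
#     print len(corpus.texts), len(corpus.samples)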

class TextCorpus(TextManager, CorpusMonitor):
    """
    One single text of the corpus.
    This is not a raw text, but a manually categorized one.
    The normalisation is : word/TAG/lemme word2/TAG2/lemme2, etc.
    """
    PATH = "corpus"
    VALID_EXT = ".crp"
    PENDING_EXT = ".pdg"
    LEXICON_EXT = ".lxc.lem.crp"

    def __init__(self, path=None):
        """
        Load a text, given a path.
        The path is optional, because content can also be loaded through
        the prepare method.
        """
        self.path = path
        self.content = ""
        if path:
            self.load()
        self._tokens = None
        self._samples = None

    def load(self):
        self.content = load_file(self.path)

    @property
    def tokens(self):
        if self._tokens is None:
            self._samples, self._tokens = self.instantiate_text(self.content.split())
        return self._tokens

    @property
    def samples(self):
        if self._samples is None:
            self.tokens  # Load tokens and samples
        return self._samples

    def __iter__(self):
        return iter(self.tokens)

    def __len__(self):
        return len(self.tokens)

    def __unicode__(self):
        return self.path

    def prepare(self, text, tagger, lemmatizer):
        """
        Given a raw text, clean it, and make tokens and samples.
        (Maybe this method should be in the TextManager class.)
        """
        text = normalize_text(text)
        tokenized_text = self.tokenize(text)
        self._samples, self._tokens = self.instantiate_text(tokenized_text)
        tagger.tag_all(self.tokens)
        lemmatizer.do(self.tokens)

    def export(self, name, force=False, add_lemmes=False):
        """
        Export the tokens in a file.
        With force, export with the valid extension; otherwise use the
        pending one.
        """
        self.content = ""
        for sample in self.samples:
            for token in sample:
                lemme = ""
                if add_lemmes:
                    # Add the lemme only if different from the original
                    if token.lemme != token.original:
                        lemme = u"/%s" % token.lemme
                self.content += u"%s/%s%s " % (unicode(token.original), token.tag, lemme)
            self.content += u"\n"  # Carriage return on each sample, for human reading
        # Define the extension
        ext = self.PENDING_EXT
        if force:
            if add_lemmes:
                ext = self.LEXICON_EXT
            else:
                ext = self.VALID_EXT
        save_to_file(os.path.join(self.PATH, "%s%s" % (name, ext)), self.content)
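
    # Illustrative only: with add_lemmes, one exported sample line looks like
    # ``word/TAG word2/TAG2/lemme2``, the lemme being appended only when it
    # differs from the original form. Words and tags below are hypothetical:
    #
    #     Les/DTN:pl chats/SBC:pl/chat dorment/VCJ:pl/dormir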

    @property
    def has_verified_lemmes(self):
        """
        Return True if the text is supposed to contain verified lemmes.
        """
        return self.path.endswith(self.LEXICON_EXT)
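
# Illustrative only: a rough sketch of the prepare/export round trip for a
# new text. Building the ``tagger`` and ``lemmatizer`` instances is out of
# the scope of this module; Lemmatizer is imported at the top, and the
# output file name is hypothetical.
#
#     text = TextCorpus()
#     text.prepare(raw_text, tagger, lemmatizer)
#     text.export("mytext")                # pending file: corpus/mytext.pdg
#     text.export("mytext", force=True)    # valid file: corpus/mytext.crp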
