Source code for sulci.thesaurus

# -*- coding:Utf-8 -*-

import re
import codecs
import os

from django.db import models, transaction
from django.db.utils import IntegrityError

from sulci.textutils import tokenize_text, lev
from sulci.base import RetrievableObject
from sulci.utils import save_to_file, get_dir

[docs]class Thesaurus(object): def __init__(self, path="thesaurus.txt"): self.descriptors = Descriptor.objects.all() def __contains__(self, item): """ TODO. """ try: d = self[item] return True except Descriptor.DoesNotExist: return False def __iter__(self): return self.descriptors.__iter__() def __getitem__(self, key): return Descriptor.objects.get(name=unicode(key))
[docs] def normalize_item(self, item): from textmining import KeyEntity#Sucks... if isinstance(item, KeyEntity): tup = tuple([unicode(t) for t in item.stemms]) elif isinstance(item, list): tup = tuple(item) elif isinstance(item, (unicode, str)): tup = tuple(item.split()) else: tup = item return tuple(sorted(tup))
@property
[docs] def triggers(self): if self._triggers is None:#cached and lazy self._triggers = set() self.load_triggers() return self._triggers
[docs] def load_triggers(self): sulci_logger.debug("Loading triggers...", "YELLOW", True) f = codecs.open(get_dir() + "corpus/triggers.trg", "r", "utf-8") for idx, line in enumerate(f.readlines()): #TODO check line validity t, created = Trigger.get_or_create(line, self, parent=self, original=line) self._triggers.add(t) f.close()
@classmethod
[docs] def reset_triggers(self): """ For full training, we need to remove previous triggers. """ save_to_file("corpus/triggers.trg", "")
[docs]class Descriptor(models.Model): """ Entries of the Thesaurus. """ parent = models.ForeignKey('self', blank=True, null=True, related_name="children") name = models.CharField(max_length=200, db_index=True) description = models.TextField(blank=True, null=True) is_alias_of = models.ForeignKey('self', blank=True, null=True, related_name="aliases", help_text="If this descriptor is an alias of another." ) def __init__(self, *args, **kwargs): self._max_weight = None super(Descriptor, self).__init__(*args, **kwargs) @property
[docs] def original(self): # Retrocompatibility return self.name
def __unicode__(self): return unicode(self.original) @property
[docs] def max_weight(self): if self._max_weight is None: # Thread cache try: #Ordered by -weight by default self._max_weight = self.triggertodescriptor_set.all()[0].weight except TriggerToDescriptor.DoesNotExist: # Should not occur. self._max_weight = 0 return self._max_weight
@property
[docs] def primeval(self): """ Returns the primeval descriptor when self is alias of another. """ if self.is_alias_of is None: return self return self.is_alias_of.primeval
[docs]class TriggerToDescriptor(models.Model): """ This is the "synapse" of the trigger to descriptor relation. """ descriptor = models.ForeignKey(Descriptor, db_index=True) trigger = models.ForeignKey("Trigger", db_index=True) weight = models.FloatField(default=0, db_index=True) @property
[docs] def pondered_weight(self): """ Give the weight of the relation, relative to the max weight of the trigger and the max weight of the descriptor. """ # current weigth relative to trigger max weight weight = self.weight / self.trigger.max_weight # current weight relative to descriptor max weight weight *= self.weight / self.descriptor.max_weight # # current weight relative to trigger count # # we use logarithm to limit negative impact for very common triggers # weight *= math.log(self.weight) / math.log(self.trigger.count) # # current weight relative to descriptor occurrences in training # # Using log to limit impact # weight *= \ # math.log(self.weight) / math.log(self.descriptor.trained_occurrences) return weight
class Meta: unique_together = ("descriptor", "trigger") ordering = ["-weight"] def __unicode__(self): return u"%s =[%f]=> %s" % (self.trigger, self.weight, self.descriptor)
[docs]class Trigger(models.Model): """ The trigger is a keyentity who suggest some descriptors when in a text. It is linked to one or more descriptors, and the distance of the link between the trigger and a descriptor is stored in the relation. This score is populated during the sementical training. """ original = models.CharField(max_length=500, db_index=True, unique=True) count = models.IntegerField(default=0,blank=True) descriptors = models.ManyToManyField("Descriptor", through="TriggerToDescriptor", blank=True, null=True) def __init__(self, *args, **kwargs): self._max_weight = None # We cache relatins to descriptors. But during training, some other processes # could create and modify relations. This is a potential source of # bad behaviour, but at the moment I prefer to have good performance # cause I launch very often the script for testing it... self._cached_descriptors = None self._cached_synapses = None super(Trigger, self).__init__(*args, **kwargs) # self.id = pk#Tuple of original string # self.original = u" ".join(pk) # self.parent = kwargs["parent"] # self._descriptors = {} # self.init_descriptors(**kwargs) @property def _descriptors(self): if self._cached_descriptors is None: self._cached_descriptors = list(self.descriptors.all()) return self._cached_descriptors @property def _synapses(self): if self._cached_synapses is None: self._cached_synapses = list(self.triggertodescriptor_set.select_related().all()[:20]) return self._cached_synapses def __unicode__(self): return unicode(self.original) def __contains__(self, key): return key in self._descriptors def __setitem__(self, key, value): if not isinstance(key, Descriptor): raise ValueError("Key must be Descriptor instance, got %s (%s) instead" % (str(key), type(key))) # Flush descriptors cache self._cached_descriptors = None # As we cache, and some other process could have created the # relation between this trigger and this descriptor # we catch IntegrityErrors. Maybe a get_or_create should do the job ? try: return TriggerToDescriptor.objects.get_or_create(descriptor=key, trigger=self, weight=value) except IntegrityError: # Another process has created the relation. # It we return self[key], we get a DatabaseError from psycho... # I've tried a transaction.rollback(), but got an error too. pass def __getitem__(self, key): return TriggerToDescriptor.objects.get(descriptor=key, trigger=self) # def __delitem__(self, key): # return self._descriptors.__delitem__(key) def __iter__(self): return self._synapses.__iter__() # Django call the __len__ method for every related model when using # select_related... # def __len__(self): # return len(self._descriptors)
[docs] def items(self): return self._descriptors
def __hash__(self): return self.original.__hash__() @property
[docs] def max_weight(self): if self._max_weight is None: # Thread cache try: #Ordered by -weight by default self._max_weight = self.triggertodescriptor_set.all().only('weight')[0].weight except TriggerToDescriptor.DoesNotExist: # Should not occur. self._max_weight = 0 return self._max_weight # return max(self[d.descriptor].weight for d in self) # def init_descriptors(self, **kwargs): # """ # Take a text descriptors storage and create the links. # """ # #original may be the full orginal line # if "original" in kwargs: # for d in kwargs["original"].split("\t")[1:]:#TODO check errors # ds = d.split() # original = ds[:-1] # dsc, created = Descriptor.get_or_create(original, self.parent, original=original) # self.connect(dsc, float(ds[-1]))
[docs] def connect(self, descriptor, score): """ Create a connection with the descriptor if doesn't yet exists. In each case, update the connection weight. Delete the connection if the score is negative. """ if not descriptor in self: # sulci_logger.debug(u"Creating connection %s - %s" % (self, descriptor), "CYAN") self[descriptor] = 0.0 rel = self[descriptor] rel.weight += score rel.save() # if self[descriptor] < 0: # del self[descriptor] # sulci_logger.debug(u"Removed connection %s - %s" % (self, descriptor), "RED")
[docs] def clean_connections(self): """ Remove the negative connections. """ for descriptor in self._descriptors.copy().__iter__(): if self[descriptor] < 0: del self[descriptor] sulci_logger.debug(u"Removed connection %s - %s" % (self, descriptor), "RED")
@classmethod
[docs] def clean_all_connections(cls): TriggerToDescriptor.objects.filter(weight__lte=0).delete()
[docs] def export(self): """ Return a string for file storage. """ if len(self) == 0: sulci_logger.debug(u"No descriptors for %s" % unicode(self), "RED") return None return u"%s\t%s" % (unicode(self), u"\t".join(u"%s %f" % (unicode(k), float(v)) for k, v in self.items()))

Project Versions