Source code for pyknp_eventgraph.predicate

from logging import getLogger
from typing import List, Optional, TYPE_CHECKING

from pyknp import Tag, Morpheme

from pyknp_eventgraph.builder import Builder
from pyknp_eventgraph.component import Component
from pyknp_eventgraph.base_phrase import BasePhrase
from pyknp_eventgraph.helper import convert_mrphs_to_surf

if TYPE_CHECKING:
    from pyknp_eventgraph.pas import PAS

logger = getLogger(__name__)


[docs]class Predicate(Component): """A predicate is the core of a PAS. Attributes: pas (PAS): A PAS that this predicate belongs. head (:class:`pyknp.knp.tag.Tag`): A head tag. type_ (str): A type of this predicate. head_base_phrase (Token, optional): A head basic phrase. """ def __init__(self, pas: 'PAS', type_: str, head: Optional[Tag] = None): self.pas: PAS = pas self.type_: str = type_ self.head: Optional[Tag] = head self.head_base_phrase: Optional[BasePhrase] = None self._surf = None self._normalized_surf = None self._mrphs = None self._normalized_mrphs = None self._reps = None self._normalized_reps = None self._standard_reps = None self._children = None self._adnominal_event_ids = None self._sentential_complement_event_ids = None @property def tag(self) -> Optional[Tag]: """The tag of the head base phrase.""" return self.head_base_phrase.tag @property def surf(self) -> str: """A surface string.""" if self._surf is None: self._surf = convert_mrphs_to_surf(self.mrphs) return self._surf @property def normalized_surf(self) -> str: """A normalized surface string.""" return self.surf @property def mrphs(self) -> str: """A tokenized string.""" if self._mrphs is None: mrphs = [] is_within_standard_repname = False for bp in self.head_base_phrase.modifiees(include_self=True): for m in bp.tag.mrph_list(): if '用言表記先頭' in m.fstring: is_within_standard_repname = True if '用言表記末尾' in m.fstring: mrphs.append(m.genkei) # Normalize the last morpheme. return ' '.join(mrphs) if is_within_standard_repname: mrphs.append(m.midasi) self._mrphs = ' '.join(mrphs) return self._mrphs @property def normalized_mrphs(self) -> str: """A tokenized/normalized surface string.""" return self.mrphs @property def reps(self) -> str: """A representative string.""" if self._reps is None: for bp in self.head_base_phrase.modifiees(include_self=True): if '用言代表表記' in bp.tag.features: self._reps = bp.tag.features['用言代表表記'] break else: self._reps = self._base_phrase_to_text( self.head_base_phrase, mode='reps', truncate=True, include_modifiees=True ) return self._reps @property def normalized_reps(self) -> str: """A normalized representative string.""" return self.reps @property def standard_reps(self) -> str: """A standard representative string.""" if self._standard_reps is None: for bp in self.head_base_phrase.modifiees(include_self=True): if '標準用言代表表記' in bp.tag.features: self._standard_reps = bp.tag.features['標準用言代表表記'] break else: self._standard_reps = self.reps return self._standard_reps @property def type(self) -> str: """The type of this predicate.""" return self.type_ @property def adnominal_event_ids(self) -> List[int]: """A list of IDs of events modifying this predicate (adnominal).""" if self._adnominal_event_ids is None: self._adnominal_event_ids = sorted( event.evid for t in self.head_base_phrase.modifiees(include_self=True) for event in t.adnominal_events ) return self._adnominal_event_ids @property def sentential_complement_event_ids(self) -> List[int]: """A list of IDs of events modifying this predicate (sentential complement).""" if self._sentential_complement_event_ids is None: self._sentential_complement_event_ids = sorted( event.evid for t in self.head_base_phrase.modifiees(include_self=True) for event in t.sentential_complement_events ) return self._sentential_complement_event_ids @property def children(self) -> List[dict]: """A list of child words.""" if self._children is None: self._children = [] for bp in reversed(self.head_base_phrase.modifiers()): self._children.append({ 'surf': convert_mrphs_to_surf(self._base_phrase_to_text(bp, mode='mrphs', truncate=False)), 'normalized_surf': convert_mrphs_to_surf( self._base_phrase_to_text(bp, mode='mrphs', truncate=True) ), 'mrphs': self._base_phrase_to_text(bp, mode='mrphs', truncate=False), 'normalized_mrphs': self._base_phrase_to_text(bp, mode='mrphs', truncate=True), 'reps': self._base_phrase_to_text(bp, mode='reps', truncate=False), 'normalized_reps': self._base_phrase_to_text(bp, mode='reps', truncate=True), 'adnominal_event_ids': [event.evid for event in bp.adnominal_events], 'sentential_complement_event_ids': [event.evid for event in bp.sentential_complement_events], 'modifier': '修飾' in bp.tag.features, 'possessive': bp.tag.features.get('係', '') == 'ノ格', }) return self._children def _base_phrase_to_text(self, bp: BasePhrase, mode: str = 'mrphs', truncate: bool = False, include_modifiees: bool = False) -> str: """Convert a base phrase to a text. Args: bp: A base phrase. mode: A type of token representation, which can take either "mrphs" or "reps". truncate: If true, adjunct words are truncated. include_modifiees: If true, parents are used to construct a compound phrase. """ assert mode in {'mrphs', 'reps'} mrphs = list(bp.tag.mrph_list()) if include_modifiees: for parent_bp in bp.modifiees(): mrphs += list(parent_bp.tag.mrph_list()) if truncate: mrphs = self._truncate_mrphs(mrphs) return self._format_mrphs(mrphs, mode, normalize=True) else: return self._format_mrphs(mrphs, mode, normalize=False) @staticmethod def _truncate_mrphs(mrphs: List[Morpheme]) -> List[Morpheme]: """Truncate a list of morphemes. Args: mrphs: A list of morphemes. """ for index, mrph in reversed(list(enumerate(mrphs))): if mrph.hinsi == '助動詞' and mrph.genkei == 'です' and 0 < index and mrphs[index - 1].hinsi == '形容詞': # adjective + 'です' -> ignore 'です' (e.g., 美しいです -> 美しい) return mrphs[:index] elif mrph.hinsi == '判定詞' and mrph.midasi == 'じゃ' and 0 < index and '<活用語>' in mrphs[index - 1].fstring: # adjective or verb +'じゃん' -> ignore 'じゃん' (e.g., 使えないじゃん -> 使えない) return mrphs[:index] elif ('<活用語>' in mrph.fstring or '<用言意味表記末尾>' in mrph.fstring) and mrph.genkei not in {'のだ', 'んだ'}: # check the last word with conjugation except some meaningless words return mrphs[:index + 1] return mrphs @staticmethod def _format_mrphs(mrphs: List[Morpheme], mode: str, normalize: bool = False) -> str: """Convert a list of morphemes to a text. Args: mrphs: A list of morphemes. mode: A type of token representation, which can take either "mrphs" or "reps". normalize: If true, the last content word will be normalized. """ assert mode in {'mrphs', 'reps'} if mode == 'reps': return ' '.join(mrph.repname or f'{mrph.midasi}/{mrph.midasi}' for mrph in mrphs) else: # i.e., mode == 'mrphs' if normalize: # Change the last morpheme to its infinitive (i.e., genkei) base = ' '.join(mrph.midasi for mrph in mrphs[:-1]) if mrphs[-1].hinsi == '助動詞' and mrphs[-1].genkei == 'ぬ': # Exception to prevent transforming "できません" into "できませぬ". return f'{base} {mrphs[-1].midasi}'.strip() else: return f'{base} {mrphs[-1].genkei}'.strip() else: return ' '.join(mrph.midasi for mrph in mrphs)
[docs] def to_dict(self) -> dict: """Convert this object into a dictionary.""" return dict( surf=self.surf, normalized_surf=self.normalized_surf, mrphs=self.mrphs, normalized_mrphs=self.normalized_mrphs, reps=self.reps, normalized_reps=self.normalized_reps, standard_reps=self.standard_reps, type=self.type, adnominal_event_ids=self.adnominal_event_ids, sentential_complement_event_ids=self.sentential_complement_event_ids, children=self.children )
[docs] def to_string(self) -> str: """Convert this object into a string.""" return f'<Predicate, type: {self.type_}, surf: {self.surf}>'
class PredicateBuilder(Builder): def __call__(self, pas: 'PAS') -> Predicate: logger.debug('Create a predicate.') predicate = Predicate(pas, self._find_type(pas.event.head), pas.event.head) pas.predicate = predicate logger.debug('Successfully created a predicate.') return predicate @staticmethod def _find_type(head: Tag) -> str: return head.features.get('用言', '') class JsonPredicateBuilder(Builder): def __call__(self, pas: 'PAS', dump: dict) -> Predicate: logger.debug('Create a predicate.') predicate = Predicate(pas, dump['type']) predicate._surf = dump['surf'] predicate._normalized_surf = dump['normalized_surf'] predicate._mrphs = dump['mrphs'] predicate._normalized_mrphs = dump['normalized_mrphs'] predicate._reps = dump['reps'] predicate._normalized_reps = dump['normalized_reps'] predicate._standard_reps = dump['standard_reps'] predicate._children = dump['children'] predicate._adnominal_event_ids = dump['adnominal_event_ids'] predicate._sentential_complement_event_ids = dump['sentential_complement_event_ids'] pas.predicate = predicate logger.debug('Successfully created a predicate.') return predicate