import collections
from logging import getLogger
from typing import List, Dict, Optional, TYPE_CHECKING
from pyknp import Argument as PyknpArgument
from pyknp import Morpheme, Tag
from pyknp_eventgraph.builder import Builder
from pyknp_eventgraph.component import Component
from pyknp_eventgraph.helper import PAS_ORDER, convert_katakana_to_hiragana
from pyknp_eventgraph.base_phrase import BasePhrase
from pyknp_eventgraph.helper import convert_mrphs_to_surf
if TYPE_CHECKING:
from pyknp_eventgraph.pas import PAS
logger = getLogger(__name__)
[docs]class Argument(Component):
"""An argument supplements its predicate's information.
Attributes:
pas (PAS): A PAS that this argument belongs.
case (str): A case.
eid (int): An entity ID.
flag (str): A flag.
sdist (int): The sentence distance between this argument and the predicate.
arg (:class:`pyknp.knp.pas.Argument`, optional): An Argument object in pyknp.
head_base_phrase (Token, optional): A head basic phrase.
"""
def __init__(self, pas: 'PAS', case: str, eid: int, flag: str, sdist: int, arg: Optional[PyknpArgument] = None):
self.pas: 'PAS' = pas
self.case: str = case
self.eid: int = eid
self.flag: str = flag
self.sdist: int = sdist
self.arg: Optional[PyknpArgument] = arg
self.head_base_phrase: Optional[BasePhrase] = None
self._surf = None
self._normalized_surf = None
self._mrphs = None
self._normalized_mrphs = None
self._reps = None
self._normalized_reps = None
self._head_reps = None
self._children = None
self._adnominal_event_ids = None
self._sentential_complement_event_ids = None
@property
def tag(self) -> Optional[Tag]:
"""The tag of the head base phrase."""
return self.head_base_phrase.tag
@property
def surf(self) -> str:
"""A surface string."""
if self._surf is None:
self._surf = convert_mrphs_to_surf(self.mrphs)
return self._surf
@property
def normalized_surf(self) -> str:
"""A normalized surface string."""
if self._normalized_surf is None:
self._normalized_surf = convert_mrphs_to_surf(self.normalized_mrphs)
return self._normalized_surf
@property
def mrphs(self) -> str:
"""A tokenized surface string."""
if self._mrphs is None:
self._mrphs = self._base_phrase_to_text(self.head_base_phrase, truncate=False, include_modifiees=True)
return self._mrphs
@property
def normalized_mrphs(self) -> str:
"""A tokenized/normalized surface string."""
if self._normalized_mrphs is None:
self._normalized_mrphs = self._base_phrase_to_text(
self.head_base_phrase,
truncate=True,
include_modifiees=True
)
return self._normalized_mrphs
@property
def reps(self) -> str:
"""A representative string."""
if self._reps is None:
self._reps = self._base_phrase_to_text(
self.head_base_phrase,
mode='reps',
truncate=False,
include_modifiees=True
)
return self._reps
@property
def normalized_reps(self) -> str:
"""A normalized representative string."""
if self._normalized_reps is None:
self._normalized_reps = self._base_phrase_to_text(
self.head_base_phrase,
mode='reps',
truncate=True,
include_modifiees=True
)
return self._normalized_reps
@property
def head_reps(self) -> str:
"""A head representative string."""
if self._head_reps is None:
if self.head_base_phrase.tag: # Not an exophora.
head_reps = self.head_base_phrase.tag.head_prime_repname or self.head_base_phrase.tag.head_repname
if head_reps:
self._head_reps = f'[{head_reps}]' if self.head_base_phrase.omitted_case else head_reps
self._head_reps = self._head_reps or self.normalized_reps
return self._head_reps
@property
def adnominal_event_ids(self) -> List[int]:
"""A list of IDs of events modifying this predicate (adnominal)."""
if self._adnominal_event_ids is None:
self._adnominal_event_ids = sorted(
event.evid for t in self.head_base_phrase.modifiees(include_self=True)
for event in t.adnominal_events
)
return self._adnominal_event_ids
@property
def sentential_complement_event_ids(self) -> List[int]:
"""A list of IDs of events modifying this predicate (sentential complement)."""
if self._sentential_complement_event_ids is None:
self._sentential_complement_event_ids = sorted(
event.evid for t in self.head_base_phrase.modifiees(include_self=True)
for event in t.sentential_complement_events
)
return self._sentential_complement_event_ids
@property
def children(self) -> List[dict]:
"""A list of child words."""
if self._children is None:
self._children = []
for bp in reversed(self.head_base_phrase.modifiers()):
self._children.append({
'surf': convert_mrphs_to_surf(self._base_phrase_to_text(bp, mode='mrphs', truncate=False)),
'normalized_surf': convert_mrphs_to_surf(
self._base_phrase_to_text(bp, mode='mrphs', truncate=True)
),
'mrphs': self._base_phrase_to_text(bp, mode='mrphs', truncate=False),
'normalized_mrphs': self._base_phrase_to_text(bp, mode='mrphs', truncate=True),
'reps': self._base_phrase_to_text(bp, mode='reps', truncate=False),
'normalized_reps': self._base_phrase_to_text(bp, mode='reps', truncate=True),
'adnominal_event_ids': [e.evid for e in bp.adnominal_events],
'sentential_complement_event_ids': [e.evid for e in bp.sentential_complement_events],
'modifier': '修飾' in bp.tag.features,
'possessive': bp.tag.features.get('係', '') == 'ノ格',
})
return self._children
def _base_phrase_to_text(
self,
bp: BasePhrase,
mode: str = 'mrphs',
truncate: bool = False,
include_modifiees: bool = False
) -> str:
"""Convert a base phrase to a text.
Args:
bp: A base phrase.
mode: A type of token representation, which can take either "mrphs" or "reps".
truncate: If true, adjunct words are truncated.
include_modifiees: If true, parents are used to construct a compound phrase.
"""
assert mode in {'mrphs', 'reps'}
if bp.omitted_case:
if bp.exophora:
base = bp.exophora
else:
mrphs = self._truncate_mrphs(list(bp.tag.mrph_list()))
base = self._format_mrphs(mrphs, mode, normalize=True)
case = convert_katakana_to_hiragana(self.case)
case = case if mode == 'mrphs' else f'{case}/{case}'
return f'[{base}]' if truncate else f'[{base} {case}]'
else:
mrphs = list(bp.tag.mrph_list())
if include_modifiees:
for parent_base_phrase in bp.modifiees():
mrphs += (parent_base_phrase.tag.mrph_list())
if truncate:
mrphs = self._truncate_mrphs(mrphs)
return self._format_mrphs(mrphs, mode, normalize=True)
else:
return self._format_mrphs(mrphs, mode, normalize=False)
@staticmethod
def _truncate_mrphs(mrphs: List[Morpheme]) -> List[Morpheme]:
"""Truncate a list of morphemes.
Args:
mrphs: A list of morphemes.
"""
content_mrphs = []
seen_content_word = False
for mrph in mrphs:
is_content_word = mrph.hinsi not in {'助詞', '特殊', '判定詞'}
if not is_content_word and seen_content_word:
break
seen_content_word = seen_content_word or is_content_word
content_mrphs.append(mrph)
return content_mrphs
@staticmethod
def _format_mrphs(mrphs: List[Morpheme], mode: str, normalize: bool = False) -> str:
"""Convert a list of morphemes to a text.
Args:
mrphs: A list of morphemes.
mode: A type of token representation, which can take either "mrphs" or "reps".
normalize: If true, the last content word will be normalized.
"""
assert mode in {'mrphs', 'reps'}
if mode == 'reps':
return ' '.join(mrph.repname or f'{mrph.midasi}/{mrph.midasi}' for mrph in mrphs)
else:
if normalize:
# Change the last morpheme to its infinitive (i.e., genkei).
# Strip the return string for the case that len(mrphs) == 1.
return (' '.join(mrph.midasi for mrph in mrphs[:-1]) + ' ' + mrphs[-1].genkei).strip()
else:
return ' '.join(mrph.midasi for mrph in mrphs)
[docs] def to_dict(self) -> dict:
"""Convert this object into a dictionary."""
return dict(
surf=self.surf,
normalized_surf=self.normalized_surf,
mrphs=self.mrphs,
normalized_mrphs=self.normalized_mrphs,
reps=self.reps,
normalized_reps=self.normalized_reps,
head_reps=self.head_reps,
eid=self.eid,
flag=self.flag,
sdist=self.sdist,
adnominal_event_ids=self.adnominal_event_ids,
sentential_complement_event_ids=self.sentential_complement_event_ids,
children=self.children
)
[docs] def to_string(self) -> str:
"""Convert this object into a string."""
return f'<Argument, case: {self.case}, surf: {self.surf}>'
class ArgumentBuilder(Builder):
def __call__(self, pas: 'PAS', case: str, arg: PyknpArgument) -> Argument:
logger.debug('Create an argument')
argument = Argument(pas, case, arg.eid, arg.flag, arg.sdist, arg)
pas.arguments[case].append(argument)
logger.debug('Successfully created an argument.')
return argument
class JsonArgumentBuilder(Builder):
def __call__(self, pas: 'PAS', case: str, dump: dict) -> Argument:
logger.debug('Create an argument')
argument = Argument(pas, case, dump['eid'], dump['flag'], dump['sdist'])
argument._surf = dump['surf']
argument._normalized_surf = dump['normalized_surf']
argument._mrphs = dump['mrphs']
argument._normalized_mrphs = dump['normalized_mrphs']
argument._reps = dump['reps']
argument._normalized_reps = dump['normalized_reps']
argument._head_reps = dump['head_reps']
argument._children = dump['children']
argument._adnominal_event_ids = dump['adnominal_event_ids']
argument._sentential_complement_event_ids = dump['sentential_complement_event_ids']
pas.arguments[case].append(argument)
logger.debug('Successfully created an argument.')
return argument
class ArgumentsBuilder(Builder):
def __call__(self, pas: 'PAS') -> Dict[str, List[Argument]]:
arguments: Dict[str, List[Argument]] = collections.defaultdict(list)
if pas.pas:
for case, args in sorted(pas.pas.arguments.items(), key=lambda x: PAS_ORDER.get(x[0], 99)):
for arg in sorted(args, key=lambda _arg: (pas.ssid - _arg.sdist, _arg.tid)):
arguments[case].append(ArgumentBuilder()(pas, case, arg))
return arguments
class JsonArgumentsBuilder(Builder):
def __call__(self, pas: 'PAS', dump: dict) -> Dict[str, List[Argument]]:
arguments: Dict[str, List[Argument]] = collections.defaultdict(list)
for case, arguments_dump in dump.items():
for argument_dump in arguments_dump:
arguments[case].append(JsonArgumentBuilder()(pas, case, argument_dump))
return arguments