Source code for gchar.resources.pixiv.tag

import re
import warnings
from itertools import chain
from typing import Iterable, Iterator, Union, List, Tuple, Type, Mapping, Optional

from .keyword import _load_pixiv_names_for_game, _load_pixiv_alias_for_game
from ...games import get_character
from ...games.base import Character
from ...utils import optional_lru_cache


def _yield_tags(tags: Union[Tuple[str], List[str], str]) -> Iterator[str]:
    """
    Yield individual tags from a string or a list of tags.

    :param tags: The tags to yield.
    :type tags: Union[Tuple[str], List[str], str]
    :returns: An iterator of individual tags.
    :rtype: Iterator[str]
    """
    if isinstance(tags, str):
        for item in re.split(r'\s+', tags):
            if item:
                yield item
    elif isinstance(tags, (list, tuple, set)):
        for item in tags:
            yield from _yield_tags(item)


def _format_tags(positive, negative, or_clause=None):
    """
    Build a Pixiv search string from required, excluded and alternative tags.

    The result is the sorted required tags, then (if any) the alternatives as
    ``(a OR b OR ...)``, then the sorted excluded tags each prefixed with ``-``,
    all joined by single spaces. An alternative group that consists of exactly
    one tag is treated as a required tag instead. Excluded tags that are also
    required or alternative tags are dropped.

    :param positive: The positive tags.
    :type positive: Union[Tuple[str], List[str], str]
    :param negative: The negative tags.
    :type negative: Union[Tuple[str], List[str], str]
    :param or_clause: The OR clause tags.
    :type or_clause: Optional[Union[Tuple[str], List[str], str]]
    :returns: The formatted Pixiv search tag string.
    :rtype: str
    """
    required = list(_yield_tags(positive))
    banned = list(_yield_tags(negative))
    alternatives = list(_yield_tags(or_clause or ''))

    # The single-tag test is made on the raw list (before de-duplication).
    if len(alternatives) == 1:
        required.extend(alternatives)
        alternatives = []

    required_set = set(required)
    alternative_set = set(alternatives)
    banned_set = set(banned) - required_set - alternative_set

    phrases = sorted(required_set)
    if alternative_set:
        phrases.append('(' + ' OR '.join(sorted(alternative_set)) + ')')
    phrases.extend(f'-{tag}' for tag in sorted(banned_set))

    return ' '.join(phrases)


# Default upper bound on the length of a generated search string.
# ``get_pixiv_keywords`` warns when a caller asks for a larger ``max_length``.
PIXIV_TAG_MAX_LENGTH = 256


class PixivCharPool:
    """
    Characters of one game together with the Pixiv statistics of their names.

    The pool builds Pixiv search strings for a character: it matches the
    character's names and excludes other known names that contain them.
    """

    def __init__(self, chars: Iterable[Character],
                 names_dict: Mapping[str, Tuple[int, float, List[Tuple[str, int]]]],
                 names_alias: Mapping[Union[str, int], List[str]]):
        """
        Initialize the Pixiv character pool.

        :param chars: The iterable of Character instances.
        :type chars: Iterable[Character]
        :param names_dict: Mapping from a name to ``(count, pollution ratio, pollution words)``,
            where the pollution words are ``(word, count)`` pairs.
        :type names_dict: Mapping[str, Tuple[int, float, List[Tuple[str, int]]]]
        :param names_alias: The mapping of character index to extra alias names.
        :type names_alias: Mapping[Union[str, int], List[str]]
        """
        self.__chars = list(chars)
        self.__names_dict = names_dict
        self.__names_alias = names_alias
        # Every known name (statistics keys plus all aliases), sorted so that
        # iteration order is deterministic.
        self.__all_names = sorted(
            set(self.__names_dict.keys()) |
            set(chain.from_iterable(self.__names_alias.values()))
        )

    def __get_name_item(self, name) -> Optional[Tuple[int, float, List[Tuple[str, int]]]]:
        """
        Get the Pixiv data for a specific character name.

        :param name: The character name.
        :type name: str
        :returns: The Pixiv data tuple, or ``None`` when the name is unknown.
        :rtype: Optional[Tuple[int, float, List[Tuple[str, int]]]]
        """
        return self.__names_dict.get(name, None)

    def __get_name_count(self, name) -> int:
        """
        Get the count recorded for a specific character name.

        :param name: The character name.
        :type name: str
        :returns: The recorded count, or ``0`` when the name is unknown.
        :rtype: int
        """
        tpl = self.__get_name_item(name)
        if tpl:
            count, _, _ = tpl
            return count
        else:
            return 0

    def __get_name_pollution_ratio(self, name) -> float:
        """
        Get the pollution ratio for a specific character name.

        :param name: The character name.
        :type name: str
        :returns: The pollution ratio, or ``0.0`` when the name is unknown.
        :rtype: float
        """
        tpl = self.__get_name_item(name)
        if tpl:
            _, ratio, _ = tpl
            return ratio
        else:
            return 0.0

    def __get_name_pollution_words(self, name) -> List[Tuple[str, int]]:
        """
        Get the pollution words and their counts for a specific character name.

        :param name: The character name.
        :type name: str
        :returns: A list of ``(word, count)`` tuples, empty when the name is unknown.
        :rtype: List[Tuple[str, int]]
        """
        tpl = self.__get_name_item(name)
        if tpl:
            _, _, pollution = tpl
            return pollution
        else:
            return []

    def _iter_dup_names(self, name: str) -> Iterator[str]:
        """
        Iterate over the other known names that contain the given name.

        :param name: The name to search for.
        :type name: str
        :returns: An iterator of names that have ``name`` as a proper substring.
        :rtype: Iterator[str]
        """
        for sname in self.__all_names:
            if name != sname and name in sname:
                yield sname

    def _collect_names(self, char, char_names: List[str], dup_names: Mapping[str, List[str]], positive,
                       max_pollution_ratio: float, max_exclude_per_word: Optional[int]):
        """
        Select the names usable in the OR clause and the names they require excluding.

        A name is accepted when it is the only name of the character, or when its
        pollution ratio does not exceed ``max_pollution_ratio`` and the number of
        known names containing it does not exceed ``max_exclude_per_word``
        (``None`` disables that second limit).

        :param char: The character the names belong to.
        :param char_names: The sorted, lower-cased names of the character.
        :param dup_names: For each name, the known names that contain it.
        :param positive: The positive tags; these are never excluded.
        :param max_pollution_ratio: Upper bound of the pollution ratio of an accepted name.
        :param max_exclude_per_word: Upper bound of containing names per accepted name, or ``None``.
        :returns: ``(or_clause, exclude_name_pairs)``; the pairs are ``(name, count, rank)``
            tuples, rank ``1`` for pollution words and ``0`` for containing names.
        """
        or_clause = set()
        seen = set()
        exclude_name_pairs = []
        for chname in char_names:
            exnames = dup_names[chname]
            accepted = len(char_names) == 1 or (
                self.__get_name_pollution_ratio(chname) <= max_pollution_ratio
                and (max_exclude_per_word is None or len(exnames) <= max_exclude_per_word)
            )
            if not accepted:
                continue

            or_clause.add(chname)
            # Pollution words first, containing names second - the first
            # occurrence of a word decides its rank.
            candidates = chain(
                ((pword, pcnt, 1) for pword, pcnt in self.__get_name_pollution_words(chname)),
                ((exname, self.__get_name_count(exname), 0) for exname in exnames),
            )
            for word, count, rank in candidates:
                if word not in positive and word not in or_clause and \
                        word != char and word not in seen:
                    seen.add(word)
                    exclude_name_pairs.append((word, count, rank))

        return or_clause, exclude_name_pairs

    def get_tag(self, char: Character, use_english: bool = False,
                positive: Optional[List[str]] = None, negative: Optional[List[str]] = None,
                max_exclude_per_word: int = 20, max_exclude: int = 20, max_pollution_ratio: float = 0.8,
                max_length: int = PIXIV_TAG_MAX_LENGTH):
        """
        Generate a Pixiv search tag for a specific character.

        The character's names form an OR clause; known names that contain one of
        them, and the recorded pollution words, are excluded. When no name meets
        ``max_pollution_ratio`` the limit is relaxed to just above the lowest
        ratio among the names; when even that selects nothing, the limit on
        ``max_exclude_per_word`` is ignored as a last resort. Exclusions are then
        dropped from the end of the ranked list until the result fits ``max_length``.

        :param char: The character instance.
        :type char: Character
        :param use_english: Whether to use English names in the tag.
        :type use_english: bool
        :param positive: The positive tags to include.
        :type positive: Optional[List[str]]
        :param negative: The negative tags to exclude.
        :type negative: Optional[List[str]]
        :param max_exclude_per_word: The maximum number of excluded tags per word.
        :type max_exclude_per_word: int
        :param max_exclude: The maximum number of excluded tags.
        :type max_exclude: int
        :param max_pollution_ratio: The maximum pollution ratio for including tags.
        :type max_pollution_ratio: float
        :param max_length: The maximum length of the generated tag.
        :type max_length: int
        :returns: The generated Pixiv search tag.
        :rtype: str
        :raises TypeError: If ``char`` is not a :class:`Character`.
        :raises ValueError: If the character has no name to search for.
        """
        if not isinstance(char, Character):
            raise TypeError(f'Invalid character type - {char!r}.')

        char_names = [*char.cnnames, *char.jpnames]
        if use_english:
            char_names.extend(char.ennames)
        if char.index in self.__names_alias:
            char_names.extend(self.__names_alias[char.index])
        char_names = sorted(set(map(lambda x: str(x).lower(), char_names)))
        if not char_names:
            # Without any name no OR clause can ever be built.
            raise ValueError(f'No name available for character - {char!r}.')

        positive = set(_yield_tags(positive or []))
        negative = set(_yield_tags(negative or [])) - positive

        # Computed once; every attempt below works on the same data.
        dup_names = {name: list(self._iter_dup_names(name)) for name in char_names}
        lowest_ratio = min(self.__get_name_pollution_ratio(name) for name in char_names)
        attempts = (
            (max_pollution_ratio, max_exclude_per_word),
            (min(lowest_ratio, 1.0) + 0.015, max_exclude_per_word),
            # Last resort: the least polluted name always passes here, so the
            # OR clause cannot stay empty.
            (lowest_ratio + 0.015, None),
        )
        or_clause, exclude_name_pairs = set(), []
        for ratio_limit, per_word_limit in attempts:
            or_clause, exclude_name_pairs = self._collect_names(
                char, char_names, dup_names, positive, ratio_limit, per_word_limit)
            if or_clause:
                break

        # Containing names before pollution words, then most frequent first.
        exclude_name_pairs = sorted(
            exclude_name_pairs, key=lambda x: (x[2], -x[1], len(x[0]), x[0]))[:max_exclude]

        while True:
            current_negative = negative | {exname for exname, _, _ in exclude_name_pairs}
            ret_keyword = _format_tags(positive, current_negative, or_clause)
            if len(ret_keyword) <= max_length:
                return ret_keyword
            if not exclude_name_pairs:
                # Nothing left to drop; return the shortest keyword available
                # instead of looping forever.
                warnings.warn(UserWarning(
                    f'Unable to fit the keyword for {char!r} into {max_length!r} characters, '
                    f'{len(ret_keyword)!r} characters are used.'), stacklevel=2)
                return ret_keyword
            exclude_name_pairs = exclude_name_pairs[:-1]

    def _iter_end_dup_names(self, name: str) -> Iterator[str]:
        """
        Iterate over the other known names that end with the given name.

        :param name: The name to search for.
        :type name: str
        :returns: An iterator of names.
        :rtype: Iterator[str]
        """
        for sname in self.__all_names:
            if name != sname and sname.endswith(name):
                yield sname

    def get_simple_tag(self, char: Character, base_tag: str, max_exclude: int = 20):
        """
        Generate a simplified Pixiv search tag for a specific character.

        Every Japanese name becomes a required tag, written as ``name(base_tag)``
        when ``base_tag`` is non-empty. Known names ending with one of the
        Japanese names are excluded, highest count first, at most ``max_exclude``.

        :param char: The character instance.
        :type char: Character
        :param base_tag: The base tag to append to the character name.
        :type base_tag: str
        :param max_exclude: The maximum number of excluded tags.
        :type max_exclude: int
        :returns: The generated simplified Pixiv search tag.
        :rtype: str
        :raises TypeError: If ``char`` is not a :class:`Character`.
        :raises ValueError: If the character has no Japanese name.
        """
        if not isinstance(char, Character):
            raise TypeError(f'Invalid character type - {char!r}.')
        if not char.jpnames:
            raise ValueError(f'Japanese name not found for character - {char!r}.')

        positive = set()
        exclude_names = set()
        for jpname in char.jpnames:
            if base_tag:
                positive.add(f'{jpname}({base_tag})')
            else:
                positive.add(jpname)
            exclude_names.update(self._iter_end_dup_names(str(jpname)))

        # The name is the tie-breaker so that the selection does not depend on
        # set iteration order.
        ranked = sorted(exclude_names, key=lambda x: (-self.__get_name_count(x), x))
        negative = set(ranked[:max_exclude])

        return _format_tags(positive, negative)


@optional_lru_cache()
def _get_char_pool(cls: Type[Character], **kwargs):
    """
    Build the Pixiv character pool of a character class.

    The name statistics and the alias table are loaded for ``cls`` and combined
    with the characters returned by ``cls.all(**kwargs)``.

    :param cls: The character class.
    :type cls: Type[Character]
    :param kwargs: Extra keyword arguments forwarded to ``cls.all``.
    :returns: The PixivCharPool instance.
    :rtype: PixivCharPool
    """
    # NOTE(review): ``optional_lru_cache`` presumably memoizes the result per
    # argument set - confirm against its definition.
    pixiv_names = _load_pixiv_names_for_game(cls)
    pixiv_aliases = _load_pixiv_alias_for_game(cls)
    characters = cls.all(**kwargs)
    return PixivCharPool(characters, pixiv_names, pixiv_aliases)


def get_pixiv_keywords(char: Union[Character, str], simple: bool = False, use_english: bool = True,
                       includes: Optional[List[str]] = None, exclude: Optional[List[str]] = None,
                       allow_fuzzy: bool = True, fuzzy_threshold: int = 70, max_exclude: int = 20,
                       max_pollution_ratio: float = 0.8, max_length: int = PIXIV_TAG_MAX_LENGTH, **kwargs):
    """
    Get the Pixiv search keywords for a specific character.

    A character given by name is resolved with ``get_character``. With
    ``simple=True`` the simplified tag is tried first; if the character has no
    Japanese name a warning is issued and the full tag is generated instead.
    ``includes`` and ``exclude`` only affect the full tag.

    :param char: The character instance or name.
    :type char: Union[Character, str]
    :param simple: Whether to generate a simplified tag.
    :type simple: bool
    :param use_english: Whether to use English names in the tag.
    :type use_english: bool
    :param includes: The positive tags to include.
    :type includes: Optional[Union[Tuple[str], List[str], str]]
    :param exclude: The negative tags to exclude.
    :type exclude: Optional[Union[Tuple[str], List[str], str]]
    :param allow_fuzzy: Whether to allow fuzzy matching of character names.
    :type allow_fuzzy: bool
    :param fuzzy_threshold: The threshold for fuzzy matching.
    :type fuzzy_threshold: int
    :param max_exclude: The maximum number of excluded tags.
    :type max_exclude: int
    :param max_pollution_ratio: The maximum pollution ratio for including tags.
    :type max_pollution_ratio: float
    :param max_length: The maximum length of the generated tag.
    :type max_length: int
    :param kwargs: Extra keyword arguments forwarded to ``get_character`` and to the
        character class's ``all`` method.
    :returns: The generated Pixiv search keywords.
    :rtype: str
    :raises ValueError: If the character is unknown.

    Examples::
        >>> from gchar.resources.pixiv import get_pixiv_keywords
        >>>
        >>> get_pixiv_keywords('amiya')
        'アークナイツ (amiya OR アーミヤ OR 阿米娅)'
        >>> get_pixiv_keywords('surtr')
        'アークナイツ (surtr OR スルト OR 史尔特尔) -明日方舟スルト'
        >>> get_pixiv_keywords('dusk')  # ケルシー and ドロシー may cause search noises
        'アークナイツ (dusk OR シー OR 夕) -ケルシー -シージ -シーン -ドロシー -ルーシー -夕張 -夕日 -夕焼け'
    """
    original_char = char
    if not isinstance(char, Character):
        char = get_character(char, allow_fuzzy, fuzzy_threshold, **kwargs)
    if not char:
        raise ValueError(f'Unknown character - {original_char!r}.')
    if max_length > PIXIV_TAG_MAX_LENGTH:
        warnings.warn(UserWarning(f'The maximum length pixiv supports is {PIXIV_TAG_MAX_LENGTH}, '
                                  f'but {max_length!r} is given. '
                                  f'This may result in no search results.'), stacklevel=2)

    pool = _get_char_pool(type(char), **kwargs)
    game_tag = char.__pixiv_keyword__
    base_tag = char.__pixiv_suffix__

    if simple:
        try:
            return pool.get_simple_tag(char, base_tag, max_exclude=max_exclude)
        except ValueError:
            warnings.warn(UserWarning(f'No japanese name for {char!r}, falling back to full tag.'), stacklevel=2)

    # A bare string must stay one entry: unpacking it would turn it into
    # single characters. ``get_tag`` splits it on whitespace later.
    if isinstance(includes, str):
        pos = [includes]
    else:
        pos = [*(includes or [])]
    if game_tag:
        pos.append(game_tag)
    return pool.get_tag(char, use_english, positive=pos, negative=exclude,
                        max_exclude=max_exclude, max_pollution_ratio=max_pollution_ratio, max_length=max_length)