Source code for gchar.resources.pixiv.tag

import re
import warnings
from itertools import chain
from typing import Iterable, Iterator, Union, List, Tuple, Type, Mapping, Optional

from .keyword import _load_pixiv_names_for_game, _load_pixiv_alias_for_game
from ...games import get_character
from ...games.base import Character
from ...utils import optional_lru_cache


def _yield_tags(tags: Union[Tuple[str], List[str], str]) -> Iterator[str]:
    """
    Flatten a tag specification into a stream of single tags.

    A string is split on runs of whitespace and empty pieces are dropped.
    A list, tuple or set is walked element by element, each element being
    flattened recursively. Any other value (``None`` included) produces
    no tags at all.

    :param tags: The tag specification to flatten.
    :type tags: Union[Tuple[str], List[str], str]
    :returns: An iterator over the individual, non-empty tags.
    :rtype: Iterator[str]
    """
    if isinstance(tags, (list, tuple, set)):
        # Containers may be nested arbitrarily deep; recurse into each element.
        for element in tags:
            yield from _yield_tags(element)
    elif isinstance(tags, str):
        # ``filter(None, ...)`` discards the empty strings that ``re.split``
        # emits for leading/trailing whitespace.
        yield from filter(None, re.split(r'\s+', tags))


def _format_tags(positive, negative, or_clause=None):
    """
    Format the positive and negative tags into a valid Pixiv search tag string.

    The result is made of, in this order and separated by single spaces:
    the sorted positive tags, one ``(a OR b OR ...)`` group built from the
    sorted OR-clause tags (if any), and the sorted negative tags, each
    prefixed with ``-``.

    An OR clause that contains only one distinct tag is equivalent to a
    plain positive tag, so it is merged into the positive tags instead of
    being emitted as a one-element group. Negative tags that also appear
    as positive or OR-clause tags are dropped.

    :param positive: The positive tags.
    :type positive: Union[Tuple[str], List[str], str]
    :param negative: The negative tags.
    :type negative: Union[Tuple[str], List[str], str]
    :param or_clause: The OR clause tags.
    :type or_clause: Optional[Union[Tuple[str], List[str], str]]
    :returns: The formatted Pixiv search tag string.
    :rtype: str
    """
    positive_tags = set(_yield_tags(positive))
    # De-duplicate BEFORE counting, so that ``['a', 'a']`` is recognised as a
    # single tag rather than rendered as the degenerate group ``(a)``.
    or_clause_tags = set(_yield_tags(or_clause or ''))
    if len(or_clause_tags) == 1:
        positive_tags |= or_clause_tags
        or_clause_tags = set()

    # A tag that is explicitly requested must never be excluded as well.
    negative_tags = set(_yield_tags(negative)) - positive_tags - or_clause_tags

    all_phrases = sorted(positive_tags)
    if or_clause_tags:
        all_phrases.append(f'({" OR ".join(sorted(or_clause_tags))})')
    all_phrases.extend(f'-{tag}' for tag in sorted(negative_tags))

    return ' '.join(all_phrases)


# Default upper bound on the length of a generated search keyword.
# get_pixiv_keywords warns that anything longer than this is beyond what
# Pixiv supports and may yield no search results.
PIXIV_TAG_MAX_LENGTH = 256


class PixivCharPool:
    """
    Pool of the characters of one game plus the Pixiv statistics of their names.

    The pool is used to build search keywords for a character: it knows which
    names exist, how many illustrations each name has, how "polluted" a name
    is, and which other known names contain a given name as a substring.
    """

    def __init__(self, chars: Iterable[Character],
                 names_dict: Mapping[str, Tuple[int, float, List[Tuple[str, int]]]],
                 names_alias: Mapping[Union[str, int], List[str]]):
        """
        Initialize the Pixiv character pool.

        :param chars: The iterable of Character instances.
        :type chars: Iterable[Character]
        :param names_dict: The mapping of character names to their Pixiv data,
            as ``(count, pollution ratio, pollution words)`` tuples.
        :type names_dict: Mapping[str, Tuple[int, float, List[Tuple[str, int]]]]
        :param names_alias: The mapping of character IDs or names to their Pixiv aliases.
        :type names_alias: Mapping[Union[str, int], List[str]]
        """
        self.__chars = list(chars)
        self.__names_dict = names_dict
        self.__names_alias = names_alias
        # Every known name (statistics keys and aliases), sorted so that
        # iteration order is deterministic.
        self.__all_names = sorted(
            set(self.__names_dict.keys()) | set(chain.from_iterable(self.__names_alias.values()))
        )

    def __get_name_item(self, name) -> Optional[Tuple[int, float, List[Tuple[str, int]]]]:
        """
        Get the Pixiv data for a specific character name.

        :param name: The character name.
        :type name: str
        :returns: The Pixiv data tuple, or ``None`` when the name is unknown.
        :rtype: Optional[Tuple[int, float, List[Tuple[str, int]]]]
        """
        return self.__names_dict.get(name, None)

    def __get_name_count(self, name) -> int:
        """
        Get the count of illustrations for a specific character name.

        :param name: The character name.
        :type name: str
        :returns: The count of illustrations, ``0`` when the name is unknown.
        :rtype: int
        """
        tpl = self.__get_name_item(name)
        if tpl:
            count, _, _ = tpl
            return count
        else:
            return 0

    def __get_name_pollution_ratio(self, name) -> float:
        """
        Get the pollution ratio for a specific character name.

        :param name: The character name.
        :type name: str
        :returns: The pollution ratio, ``0.0`` when the name is unknown.
        :rtype: float
        """
        tpl = self.__get_name_item(name)
        if tpl:
            _, ratio, _ = tpl
            return ratio
        else:
            return 0.0

    def __get_name_pollution_words(self, name) -> List[Tuple[str, int]]:
        """
        Get the pollution words and their counts for a specific character name.

        :param name: The character name.
        :type name: str
        :returns: A list of pollution word-count tuples, empty when the name is unknown.
        :rtype: List[Tuple[str, int]]
        """
        tpl = self.__get_name_item(name)
        if tpl:
            _, _, pollution = tpl
            return pollution
        else:
            return []

    def _iter_dup_names(self, name: str) -> Iterator[str]:
        """
        Iterate over the other known names that contain the given name.

        :param name: The name to search for.
        :type name: str
        :returns: An iterator of known names having ``name`` as a proper substring.
        :rtype: Iterator[str]
        """
        for sname in self.__all_names:
            if name != sname and name in sname:
                yield sname

    def __collect_candidates(self, char, char_names, positive, max_pollution_ratio, max_exclude_per_word):
        """
        Pick the names usable in the OR clause and the tags they require to be excluded.

        :param char: The character the tag is generated for.
        :param char_names: The normalised (lower-cased, sorted) names of the character.
        :param positive: The set of positive tags requested by the caller.
        :param max_pollution_ratio: Names with a higher pollution ratio are skipped.
        :param max_exclude_per_word: Names contained in more known names than this
            are skipped; ``None`` disables this limit.
        :returns: ``(or_clause, exclude_name_pairs, min_pollution)`` where
            ``exclude_name_pairs`` holds ``(name, count, kind)`` tuples (``kind`` is
            ``0`` for containing names and ``1`` for pollution words) and
            ``min_pollution`` is the lowest pollution ratio seen, capped at ``1.0``.
        """
        or_clause = set()
        exclude_names = set()
        exclude_name_pairs = []
        min_pollution = 1.0

        for chname in char_names:
            ratio = self.__get_name_pollution_ratio(chname)
            min_pollution = min(ratio, min_pollution)
            dup_names = list(self._iter_dup_names(chname))
            within_limit = max_exclude_per_word is None or len(dup_names) <= max_exclude_per_word

            # A character with one single name has no alternative, so that
            # name is always accepted regardless of the limits.
            if len(char_names) == 1 or (ratio <= max_pollution_ratio and within_limit):
                or_clause.add(chname)

                for pword, pcnt in self.__get_name_pollution_words(chname):
                    if pword not in positive and pword not in or_clause and \
                            pword != char and pword not in exclude_names:
                        exclude_names.add(pword)
                        exclude_name_pairs.append((pword, pcnt, 1))

                for exname in dup_names:
                    if exname not in positive and exname not in or_clause and \
                            exname != char and exname not in exclude_names:
                        exclude_names.add(exname)
                        exclude_name_pairs.append((exname, self.__get_name_count(exname), 0))

        return or_clause, exclude_name_pairs, min_pollution

    def get_tag(self, char: Character, use_english: bool = False,
                positive: Optional[List[str]] = None, negative: Optional[List[str]] = None,
                max_exclude_per_word: int = 20, max_exclude: int = 20,
                max_pollution_ratio: float = 0.8, max_length: int = PIXIV_TAG_MAX_LENGTH):
        """
        Generate a Pixiv search tag for a specific character.

        The character's names (lower-cased) are filtered by pollution ratio and
        by the number of other known names containing them; the accepted names
        form an OR clause, and the containing names and pollution words become
        excluded tags. When no name is accepted, the pollution limit is relaxed
        to slightly above the cleanest name's ratio, and if that is still not
        enough the per-word exclusion limit is dropped as well. Excluded tags
        are removed, least important first, until the keyword fits in
        ``max_length``.

        :param char: The character instance.
        :type char: Character
        :param use_english: Whether to use English names in the tag.
        :type use_english: bool
        :param positive: The positive tags to include.
        :type positive: Optional[List[str]]
        :param negative: The negative tags to exclude.
        :type negative: Optional[List[str]]
        :param max_exclude_per_word: The maximum number of excluded tags per word.
        :type max_exclude_per_word: int
        :param max_exclude: The maximum number of excluded tags.
        :type max_exclude: int
        :param max_pollution_ratio: The maximum pollution ratio for including tags.
        :type max_pollution_ratio: float
        :param max_length: The maximum length of the generated tag.
        :type max_length: int
        :returns: The generated Pixiv search tag.
        :rtype: str
        :raises TypeError: If ``char`` is not a :class:`Character`.
        :raises ValueError: If no name of the character can be used at all.
        """
        if not isinstance(char, Character):
            raise TypeError(f'Invalid character type - {char!r}.')

        char_names = [*char.cnnames, *char.jpnames]
        if use_english:
            char_names.extend(char.ennames)
        if char.index in self.__names_alias:
            char_names.extend(self.__names_alias[char.index])
        char_names = sorted(set(map(lambda x: str(x).lower(), char_names)))

        positive = set(_yield_tags(positive or []))
        negative = set(_yield_tags(negative or [])) - positive

        or_clause, exclude_name_pairs, min_pollution = self.__collect_candidates(
            char, char_names, positive, max_pollution_ratio, max_exclude_per_word)
        if not or_clause:
            # Nothing passed the limits: retry with the pollution limit set
            # just above the cleanest name, so that name can get through.
            relaxed_ratio = min_pollution + 0.015
            or_clause, exclude_name_pairs, _ = self.__collect_candidates(
                char, char_names, positive, relaxed_ratio, max_exclude_per_word)
            if not or_clause:
                # The per-word limit is what blocks the cleanest names; drop
                # it instead of retrying forever with unchanged limits.
                or_clause, exclude_name_pairs, _ = self.__collect_candidates(
                    char, char_names, positive, relaxed_ratio, None)
        if not or_clause:
            raise ValueError(f'No usable name found for character - {char!r}.')

        # Containing names first (kind 0), then pollution words; inside each
        # kind the most frequent tags come first. Trimming removes from the end.
        exclude_name_pairs = sorted(
            exclude_name_pairs, key=lambda x: (x[2], -x[1], len(x[0]), x[0])
        )[:max_exclude]

        while True:
            current_negative = negative | {exname for exname, _, _ in exclude_name_pairs}
            ret_keyword = _format_tags(positive, current_negative, or_clause)
            if len(ret_keyword) <= max_length:
                return ret_keyword
            if not exclude_name_pairs:
                # Nothing left to drop; return the best effort instead of
                # looping forever on a keyword that can never fit.
                warnings.warn(UserWarning(
                    f'Unable to shorten the keyword to {max_length!r} characters, '
                    f'{len(ret_keyword)!r} characters are still required.'), stacklevel=2)
                return ret_keyword
            exclude_name_pairs = exclude_name_pairs[:-1]

    def _iter_end_dup_names(self, name: str) -> Iterator[str]:
        """
        Iterate over the other known names that end with the given name.

        :param name: The name to search for.
        :type name: str
        :returns: An iterator of known names having ``name`` as a proper suffix.
        :rtype: Iterator[str]
        """
        for sname in self.__all_names:
            if name != sname and sname.endswith(name):
                yield sname

    def get_simple_tag(self, char: Character, base_tag: str, max_exclude: int = 20):
        """
        Generate a simplified Pixiv search tag for a specific character.

        Every Japanese name becomes a positive tag (``name(base_tag)`` when a
        base tag is given, the bare name otherwise). The known names ending
        with one of these names are excluded, the most frequent ones first.

        :param char: The character instance.
        :type char: Character
        :param base_tag: The base tag to append to the character name.
        :type base_tag: str
        :param max_exclude: The maximum number of excluded tags.
        :type max_exclude: int
        :returns: The generated simplified Pixiv search tag.
        :rtype: str
        :raises TypeError: If ``char`` is not a :class:`Character`.
        :raises ValueError: If the character has no Japanese name.
        """
        if not isinstance(char, Character):
            raise TypeError(f'Invalid character type - {char!r}.')
        if not char.jpnames:
            raise ValueError(f'Japanese name not found for character - {char!r}.')

        positive = set()
        exclude_names = set()
        for jpname in char.jpnames:
            # Always store plain strings: the tag formatter only understands
            # strings, so a non-string name object would be dropped by it.
            positive.add(f'{jpname}({base_tag})' if base_tag else str(jpname))
            exclude_names.update(self._iter_end_dup_names(str(jpname)))

        negative = set(sorted(exclude_names, key=self.__get_name_count, reverse=True)[:max_exclude])
        return _format_tags(positive, negative, set())
@optional_lru_cache()
def _get_char_pool(cls: Type[Character], **kwargs):
    """
    Build the Pixiv character pool of a character class.

    The Pixiv name statistics and aliases of the game are loaded first, then
    the characters are listed with ``cls.all(**kwargs)``.

    :param cls: The character class.
    :type cls: Type[Character]
    :param kwargs: Extra keyword arguments forwarded to ``cls.all``.
    :returns: The PixivCharPool instance.
    :rtype: PixivCharPool
    """
    pixiv_names = _load_pixiv_names_for_game(cls)
    pixiv_aliases = _load_pixiv_alias_for_game(cls)
    characters = cls.all(**kwargs)
    return PixivCharPool(chars=characters, names_dict=pixiv_names, names_alias=pixiv_aliases)
def get_pixiv_keywords(char: Union[Character, str], simple: bool = False, use_english: bool = True,
                       includes: Optional[List[str]] = None, exclude: Optional[List[str]] = None,
                       allow_fuzzy: bool = True, fuzzy_threshold: int = 70, max_exclude: int = 20,
                       max_pollution_ratio: float = 0.8, max_length: int = PIXIV_TAG_MAX_LENGTH, **kwargs):
    """
    Get the Pixiv search keywords for a specific character.

    :param char: The character instance or name.
    :type char: Union[Character, str]
    :param simple: Whether to generate a simplified tag. When the character
        has no Japanese name, a warning is issued and the full tag is
        generated instead.
    :type simple: bool
    :param use_english: Whether to use English names in the tag.
    :type use_english: bool
    :param includes: The positive tags to include. A plain string is split
        on whitespace into individual tags.
    :type includes: Optional[Union[Tuple[str], List[str], str]]
    :param exclude: The negative tags to exclude.
    :type exclude: Optional[Union[Tuple[str], List[str], str]]
    :param allow_fuzzy: Whether to allow fuzzy matching of character names.
    :type allow_fuzzy: bool
    :param fuzzy_threshold: The threshold for fuzzy matching.
    :type fuzzy_threshold: int
    :param max_exclude: The maximum number of excluded tags.
    :type max_exclude: int
    :param max_pollution_ratio: The maximum pollution ratio for including tags.
    :type max_pollution_ratio: float
    :param max_length: The maximum length of the generated tag.
    :type max_length: int
    :param kwargs: Extra keyword arguments forwarded to the character lookup
        and to the character pool.
    :returns: The generated Pixiv search keywords.
    :rtype: str
    :raises ValueError: If the character is unknown.

    Examples::
        >>> from gchar.resources.pixiv import get_pixiv_keywords
        >>>
        >>> get_pixiv_keywords('amiya')
        'アークナイツ (amiya OR アーミヤ OR 阿米娅)'
        >>> get_pixiv_keywords('surtr')
        'アークナイツ (surtr OR スルト OR 史尔特尔) -明日方舟スルト'
        >>> get_pixiv_keywords('dusk')  # ケルシー and ドロシー may cause search noise
        'アークナイツ (dusk OR シー OR 夕) -ケルシー -シージ -シーン -ドロシー -ルーシー -夕張 -夕日 -夕焼け'
    """
    original_char = char
    if not isinstance(char, Character):
        char = get_character(char, allow_fuzzy, fuzzy_threshold, **kwargs)
    if not char:
        raise ValueError(f'Unknown character - {original_char!r}.')

    if max_length > PIXIV_TAG_MAX_LENGTH:
        warnings.warn(UserWarning(f'The maximum length pixiv supports is {PIXIV_TAG_MAX_LENGTH}, '
                                  f'but {max_length!r} is given. '
                                  f'This may result in no search results.'), stacklevel=2)

    pool = _get_char_pool(type(char), **kwargs)
    game_tag = char.__pixiv_keyword__
    base_tag = char.__pixiv_suffix__

    if simple:
        try:
            return pool.get_simple_tag(char, base_tag, max_exclude=max_exclude)
        except ValueError:
            # The simple tag needs a Japanese name; without one, continue
            # below with the full tag.
            warnings.warn(UserWarning(f'No japanese name for {char!r}, falling back to full tag.'),
                          stacklevel=2)

    # Normalise through _yield_tags so that a plain string such as
    # 'foo bar' becomes ['foo', 'bar'] instead of being unpacked into
    # single characters.
    pos = list(_yield_tags(includes or []))
    if game_tag:
        pos.append(game_tag)

    return pool.get_tag(char, use_english, positive=pos, negative=exclude,
                        max_exclude=max_exclude, max_pollution_ratio=max_pollution_ratio,
                        max_length=max_length)