Source code for rantanplan.core

#!/usr/bin/python
# Based on previous work done by Rafael C. Carrasco, José A. Mañas
# (Communications of the ACM 30(7), 1987) and Javier Sober
# https://github.com/postdataproject/skas-archived/blob/devel/skas/phonmet/syll/grapheme2syllable.py
#
# Presyllabification and syllabification rules are taken from
# Antonio Ríos Mestre's 'El Diccionario Electrónico Fonético del Español'
# https://www.raco.cat/index.php/Elies/article/view/194843
# http://elies.rediris.es/elies4/Fon2.htm
# http://elies.rediris.es/elies4/Fon8.htm
import copy
import re
from collections import Counter
from itertools import product

from spacy.tokens import Doc

from .pipeline import load_pipeline
from .rhymes import analyze_rhyme
from .schemes import scheme
from .structures import STRUCTURES_LENGTH
from .syllabification import ALTERNATIVE_SYLLABIFICATION
from .syllabification import CONSONANT_CLUSTER_RE
from .syllabification import CONSONANT_GROUP
from .syllabification import CONSONANT_GROUP_EXCEPTION_DL
from .syllabification import CONSONANT_GROUP_EXCEPTION_LL
from .syllabification import HIATUS_FIRST_VOWEL_RE
from .syllabification import LIAISON_FIRST_PART
from .syllabification import LIAISON_SECOND_PART
from .syllabification import LOWERING_DIPHTHONGS_WITH_H
from .syllabification import POSSESSIVE_PRON_UNSTRESSED
from .syllabification import PREFIX_DES_WITH_CONSONANT_RE
from .syllabification import PREFIX_SIN_WITH_CONSONANT_RE
from .syllabification import RAISING_DIPHTHONGS_WITH_H
from .syllabification import SPACE
from .syllabification import STRESSED_PRON
from .syllabification import STRESSED_UNACCENTED_MONOSYLLABLES
from .syllabification import STRESSED_WEAK_VOWELS
from .syllabification import STRONG_VOWELS
from .syllabification import SYLLABIFICATOR_FOREIGN_WORDS_DICT
from .syllabification import UNSTRESSED_FORMS
from .syllabification import UNSTRESSED_UNACCENTED_MONOSYLLABLES
from .syllabification import W_VOWEL_GROUP
from .syllabification import WEAK_VOWELS
from .syllabification import accents_re
from .syllabification import letter_clusters_re
from .syllabification import paroxytone_re


def have_prosodic_liaison(first_syllable, second_syllable):
    """Checks for prosodic liaison between two syllables

    A second syllable that begins with "y" followed by a vowel (as in "ya")
    never links with the previous syllable. Otherwise the decision is made
    by looking up the last character of the first syllable and the first
    character of the second one in the liaison tables.

    :param first_syllable: Dictionary with key syllable (str) and is_stressed
        (bool) representing the first syllable
    :param second_syllable: Dictionary with key syllable (str) and is_stressed
        (bool) representing the second syllable
    :return: `True` if there is prosodic liaison and `False` otherwise
    :rtype: bool
    """
    onset = second_syllable['syllable']
    y_before_vowel = (
        onset[0].lower() == 'y'
        and len(onset) > 1
        and onset[1].lower() in set('aeiouáéíúó')
    )
    if y_before_vowel:
        return False
    # The table lookups use the characters as written (no lowercasing)
    coda = first_syllable['syllable']
    return coda[-1] in LIAISON_FIRST_PART and onset[0] in LIAISON_SECOND_PART
def get_syllables_word_end(words):
    """Flatten the words of a line into a single list of syllables

    Tokens that carry a "symbol" key are skipped. The last syllable of every
    remaining word is flagged in place with ``is_word_end = True`` so that
    word boundaries survive the flattening.

    :param words: List of dictionaries of syllables for each word in a line
    :return: List of dictionaries of syllables with an extra is_word_end key
        on the final syllable of each word
    :rtype: list
    """
    flattened = []
    for entry in words:
        if "symbol" in entry:
            continue
        word_syllables = entry["word"]
        if word_syllables:
            # Mutates the syllable dictionary owned by the word
            word_syllables[-1]["is_word_end"] = True
        flattened.extend(word_syllables)
    return flattened
def get_phonological_groups(word_syllables, liaison_type="synalepha",
                            breakage_func=None, liaison_positions=None):
    """Get a list of dictionaries for each phonological group on a line
    and joins the syllables to create phonological groups (pronounced
    together) according to a type of liaison, either synalepha or sinaeresis

    Syllables are merged pairwise; the loop repeats until no pending liaison
    is left, so chains of liaisons collapse into a single group. Each merged
    group records, under ``<liaison_type>_index``, the character offsets at
    which the original syllables were joined.

    :param word_syllables: List of dictionaries for each syllable of the line
    :param liaison_type: Which liaison is going to be performed, synalepha
        or sinaeresis
    :param breakage_func: Optional callable receiving
        ``(liaison_type, syllable, next_syllable)``; a truthy result prevents
        the liaison between those two syllables
    :param liaison_positions: Positions of the liaisons, one flag per
        syllable. Defaults to the ``has_<liaison_type>`` flag of each syllable
    :return: A list of conjoined syllables
    :rtype: list
    """
    syllables = word_syllables[:]
    liaison_property = f"has_{liaison_type}"
    index_property = f"{liaison_type}_index"
    if liaison_positions is None:
        liaison_positions = [int(syllable.get(liaison_property, 0))
                             for syllable in syllables]
    while sum(liaison_positions) > 0:
        next_positions = []
        reduced_syllables = []
        # Reset on every pass so a skip can never leak into the next one
        skip_next = False
        last_idx = len(syllables) - 1
        for idx, syllable in enumerate(syllables):
            if skip_next:
                skip_next = False
                continue
            # A liaison needs a following syllable; a flag on the last
            # syllable is ignored instead of reusing a stale neighbour
            next_syllable = syllables[idx + 1] if idx < last_idx else None
            breakage = (
                next_syllable is not None
                and breakage_func is not None
                and breakage_func(liaison_type, syllable, next_syllable)
            )
            if (next_syllable is not None
                    and liaison_positions[idx]
                    and not breakage):
                # Copy the list so the caller's syllable is not mutated
                boundary_index = list(syllable.get(index_property, []))
                boundary_index.append(len(syllable.get('syllable')) - 1)
                liaison = {
                    'syllable': (syllable["syllable"]
                                 + next_syllable["syllable"]),
                    'is_stressed': (syllable["is_stressed"]
                                    or next_syllable["is_stressed"]),
                    index_property: boundary_index,
                }
                # The merged group inherits these flags from its right part
                for prop in (liaison_property, "is_word_end"):
                    has_prop = next_syllable.get(prop, None)
                    if has_prop is not None:
                        liaison[prop] = has_prop
                reduced_syllables.append(liaison)
                # Carry over the pending liaison of the absorbed syllable
                next_positions.append(liaison_positions[idx + 1])
                skip_next = True
            else:
                reduced_syllables.append(syllable)
                next_positions.append(0)
        liaison_positions = next_positions
        syllables = reduced_syllables
    return clean_phonological_groups(
        syllables, liaison_positions, liaison_property
    )
def clean_phonological_groups(groups, liaison_positions, liaison_property):
    """Make the liaison flag of every group agree with the liaison positions

    Groups that carry ``liaison_property`` are copied with that key set to
    the boolean value of the matching entry in ``liaison_positions``; groups
    without the key are returned as they are (same object).

    :param groups: Phonological groups to be cleaned
    :param liaison_positions: Positions of the liaisons, one per group
    :param liaison_property: Name of the liaison flag (has_sinaeresis or
        has_synalepha)
    :return: Cleaned phonological groups
    :rtype: list
    """
    return [
        {**group, liaison_property: bool(liaison_positions[position])}
        if liaison_property in group else group
        for position, group in enumerate(groups)
    ]
def get_length_ranges(phonological_groups, length):
    """Compute the shortest and longest metrical length of a line

    Every boundary recorded in the ``synalepha_index`` and
    ``sinaeresis_index`` lists of a group counts as one extra syllable the
    line would have if that liaison were undone.

    :param phonological_groups: List of phonological groups of the line
    :param length: Metrical length of the line with the liaisons applied
    :return: Dictionary with keys min_length and max_length
    :rtype: dict
    """
    liaison_total = sum(
        len(group.get("synalepha_index", []))
        + len(group.get("sinaeresis_index", []))
        for group in phonological_groups
    )
    return {"min_length": length, "max_length": length + liaison_total}
def get_rhythmical_pattern(phonological_groups, rhythm_format="pattern",
                           rhyme_analysis=False):
    """Gets a rhythm pattern for a line in either "pattern": "-++-+-+-",
    "binary": "01101010" or "indexed": "2-3-5-7" format

    :param phonological_groups: List of phonological groups (syllables) of
        the line
    :param rhythm_format: The output format for the rhythm
    :param rhyme_analysis: When true, the result also carries a
        length_range entry with the minimum and maximum length of the line
    :return: Dictionary with the keys stress, type and length (plus
        length_range when rhyme_analysis is set)
    :rtype: dict
    """
    stresses = get_stresses(phonological_groups)
    pattern = {
        "stress": format_stress(stresses, rhythm_format),
        "type": rhythm_format,
        "length": len(stresses),
    }
    if rhyme_analysis:
        pattern["length_range"] = get_length_ranges(
            phonological_groups, pattern["length"])
    return pattern
def get_stresses(phonological_groups):
    """Gets a list of stress marks, `True` for stressed, `False` for
    unstressed from a list of phonological groups applying rules depending
    on the ending stress.

    The list is adjusted according to the position of the last stress:
    one unstressed position is added when the line ends on a stressed
    syllable, and one is removed when the last stress falls on the
    antepenultimate syllable and belongs to the last word of the line.
    A line without any stressed group is returned unadjusted.

    :param phonological_groups: List of dictionaries with the phonological
        groups (syllables) of the line
    :return: List of boolean values indicating whether a group is stressed
        (`True`) or not (`False`)
    :rtype: list
    """
    stresses = [group["is_stressed"] for group in phonological_groups]
    if True not in stresses:
        # Nothing to anchor the ending rules on (e.g. a line with no words)
        return stresses
    word_ends = [group.get("is_word_end", False)
                 for group in phonological_groups]
    # Negative position of the last syllable of the penultimate word
    if word_ends.count(True) > 1:
        penultimate_word = -(
            [i for i, is_end in enumerate(word_ends[::-1]) if is_end][1] + 1)
    else:
        penultimate_word = None
    last_stress = -(stresses[::-1].index(True) + 1)
    if last_stress == -1:
        # Oxytone ending (aguda): count one more syllable
        stresses.append(False)
    elif last_stress == -3:
        # Proparoxytone ending (esdrújula): count one syllable less, but
        # only when the stressed syllable lies inside the last word
        if penultimate_word is None or last_stress > penultimate_word:
            stresses.pop()
    return stresses
def format_stress(stresses, rhythm_format="pattern", indexed_separator="-"):
    """Converts a list of boolean elements into a string that matches the
    chosen rhythm format:

    - "indexed": 2-5-8 (1-based positions of the stressed syllables joined
      with ``indexed_separator``)
    - "pattern": -++--+-+-
    - "binary": 01101001

    Any value other than "indexed" or "binary" is treated as "pattern".

    :param stresses: List of boolean elements representing stressed syllables
    :param rhythm_format: Format to be used: indexed, pattern, or binary
    :param indexed_separator: String to use as a separator for indexed pattern
    :return: String with the stress pattern
    :rtype: str
    """
    if rhythm_format == 'indexed':
        positions = (
            str(position)
            for position, stress in enumerate(stresses, start=1)
            if stress
        )
        return indexed_separator.join(positions)
    if rhythm_format == 'binary':
        return "".join(str(int(stress)) for stress in stresses)
    return "".join("+" if stress else "-" for stress in stresses)
""" Syllabifier functions """
def apply_exception_rules(word):
    """Applies presyllabification rules to a word,
    based on Antonio Ríos Mestre's work

    The rules are tried in a fixed order. Whenever a rule matches at the
    start of the (possibly already hyphenated) word, the word is replaced
    by the captured groups of that match joined with hyphens.

    :param word: A string to be checked for exceptions
    :return: A string with the presyllabified word
    :rtype: str
    """
    ordered_rules = (
        # Vowel + w + vowel group
        W_VOWEL_GROUP,
        # Consonant groups with exceptions for LL and DL
        CONSONANT_GROUP,
        CONSONANT_GROUP_EXCEPTION_LL,
        CONSONANT_GROUP_EXCEPTION_DL,
        # Prefix 'sin' followed by consonant
        PREFIX_SIN_WITH_CONSONANT_RE,
        # Prefix 'des' followed by consonant
        PREFIX_DES_WITH_CONSONANT_RE,
    )
    for rule in ordered_rules:
        anchored = rule.match(word)
        if anchored is not None:
            word = "-".join(anchored.groups())
    return word
def apply_exception_rules_post(word):
    """Applies the exception rules that run after the main syllabification
    pass, based on Antonio Ríos Mestre's work

    First a hyphen is inserted for hiatus cases, then the patterns for
    consonant clusters and diphthongs with an intervening "h" are rewritten
    as the concatenation of their three captured groups.

    :param word: A hyphenated string to be checked for exceptions
    :return: A string with the corrected hyphenated word
    :rtype: str
    """
    # Each substitution is repeated so several replacements can be
    # performed; the number of passes is the size of the first match found.
    hiatus_hits = HIATUS_FIRST_VOWEL_RE.findall(word)
    if hiatus_hits:
        for _ in range(len(hiatus_hits[0])):
            word = HIATUS_FIRST_VOWEL_RE.sub(r'\1\2-\3', word)
    for pattern in (CONSONANT_CLUSTER_RE,
                    LOWERING_DIPHTHONGS_WITH_H,
                    RAISING_DIPHTHONGS_WITH_H):
        hits = pattern.findall(word)
        if not hits:
            continue
        for _ in range(len(hits[0])):
            word = pattern.sub(r'\1\2\3', word)
    return word
def syllabify(word, alternative_syllabification=False):
    """Syllabifies a word.

    Words found in the foreign words dictionary use the hyphenation stored
    there. Any other word goes through the presyllabification rules, is
    scanned character by character to place the hyphens, and is finally
    corrected by the post rules.

    :param word: The word to be syllabified.
    :param alternative_syllabification: Whether or not the alternative
        syllabification is used
    :return: The first alternative stored for the word when
        ``alternative_syllabification`` is set and the lowercased word has
        an entry in the alternatives table; otherwise a tuple with the list
        of syllables and the alternatives registered for the word.
    :rtype: tuple
    """
    original_word = word
    if word in SYLLABIFICATOR_FOREIGN_WORDS_DICT:
        hyphenated = SYLLABIFICATOR_FOREIGN_WORDS_DICT[word]
    else:
        remaining = apply_exception_rules(word)
        pieces = []
        for offset in range(len(remaining)):
            tail = remaining[offset:]
            pieces.append(tail[0])
            # First cluster pattern found in what is left of the word
            cluster = letter_clusters_re.search(tail)
            # A hyphen follows the character unless the group that matched
            # last is number 5, 8 or 11
            if cluster is not None and cluster.lastindex not in {5, 8, 11}:
                pieces.append("-")
        hyphenated = apply_exception_rules_post("".join(pieces))
    # Drop the empty chunks created during syllabification
    syllables = [chunk for chunk in hyphenated.split("-") if chunk]
    if alternative_syllabification:
        lowered = original_word.lower()
        if lowered in ALTERNATIVE_SYLLABIFICATION:
            return ALTERNATIVE_SYLLABIFICATION[lowered][1][0]
    # NOTE(review): this lookup uses the word as written, while the one
    # above lowercases it first - confirm whether that is intended
    alternatives = ALTERNATIVE_SYLLABIFICATION.get(
        original_word, (None, ()))[1]
    return (syllables, alternatives)
def get_orthographic_accent(syllable_list):
    """Given a list of str representing syllables,
    return position in the list of a syllable bearing
    orthographic stress (with the acute accent mark in Spanish)

    :param syllable_list: list of syllables as str or unicode each
    :return: Zero-based position of the first syllable containing an
        accent mark, or None if no orthographic stress
    :rtype: int
    """
    joined = "|".join(syllable_list)
    hit = accents_re.search(joined)
    if hit is None:
        return None
    # Each separator before the accent marks one completed syllable
    return joined[:hit.start()].count("|")
def is_paroxytone(syllables):
    """Given a list of str representing syllables from a single word,
    check if it is paroxytonic (llana) or not

    A word that carries an orthographic accent is never reported as
    paroxytone by this function; for unaccented words the decision is taken
    from the shape of the last syllable.

    :param syllables: List of syllables as str
    :return: `True` if paroxytone, `False` if not
    :rtype: bool
    """
    if not syllables:
        return False
    # Compare against None explicitly: position 0 is a valid accent
    # position and must not be mistaken for "no accent"
    if get_orthographic_accent(syllables) is not None:
        return False
    return paroxytone_re.search(syllables[-1]) is not None
def spacy_tag_to_dict(tag):
    """Creates a dict from spacy pos tags

    Features are separated by "|" and written as ``Name=Value``. Components
    without an "=" are ignored, and only the first "=" of a component
    separates the name from the value.

    :param tag: Extended spacy pos tag
        ("Definite=Ind|Gender=Masc|Number=Sing|PronType=Art")
    :return: A dictionary in the form of
        "{'Definite': 'Ind', 'Gender': 'Masc', 'Number': 'Sing',
        'PronType': 'Art'}"; empty when the tag is empty or has no features
    :rtype: dict
    """
    if not tag or '=' not in tag:
        return {}
    features = {}
    for feature in tag.split('|'):
        name, separator, value = feature.partition('=')
        if separator:
            features[name] = value
    return features
def get_word_stress(word, pos, tag, alternative_syllabification=False,
                    is_last_word=False):
    """Syllabifies a word and decides which of its syllables, if any, is
    stressed

    The decision combines the orthographic accent, the paroxytone check and
    the part of speech of the word. The last word of a verse skips the
    part-of-speech rules and is always stressed. Consecutive syllables that
    meet at vowels (optionally with an "h" in between) are flagged with
    ``has_sinaeresis`` on the first syllable of the pair.

    :param word: Word string
    :param pos: PoS tag from spacy ("DET")
    :param tag: Dictionary with the extended PoS tag info from spacy
        ({"Definite": "Ind", "Gender": "Masc", ...}); it is queried with
        ``tag.get``
    :param alternative_syllabification: Whether or not the alternative
        syllabification is used
    :param is_last_word: Whether or not the word is the last one of a verse
    :return: Dict with the keys ``word`` (list of dictionaries with the keys
        syllable and is_stressed, and has_sinaeresis where it applies) and
        ``stress_position`` (negative index of the stressed syllable, or 0
        if the word is unstressed). Adverbs ending in -mente also carry
        ``secondary_stress_positions``.
    :rtype: dict
    """
    syllable_list, _ = syllabify(word, alternative_syllabification)
    word_lower = word.lower()
    # Handle secondary stress on adverbs ending in -mente: the root and the
    # ending are analysed as two separate words and then glued together
    if pos == "ADV" and word_lower[-5:] == "mente" and len(word) > 5:
        root = word[:-5]
        mente = word[-5:]
        # NOTE(review): "" is passed as tag although tag.get(...) is used
        # further down; this appears to rely on the "ADJ"/"NOUN" checks
        # short-circuiting before any tag.get call - confirm
        stress_root = get_word_stress(root, "ADJ", "")
        stress_mente = get_word_stress(mente, "NOUN", "")
        return {
            'word': stress_root['word'] + stress_mente['word'],
            # Shift the root stress left by the syllables of the ending
            "stress_position": stress_root['stress_position'] - len(
                stress_mente['word']),
            "secondary_stress_positions": [stress_mente['stress_position']],
        }
    # Bypass POS exceptions for the last word of a verse as it should always be
    # stressed
    if is_last_word:
        if len(syllable_list) == 1:
            stressed_position = -1
        else:
            tilde = get_orthographic_accent(syllable_list)
            # If an orthographic accent exists,
            # the syllable negative index is saved
            if tilde is not None:
                stressed_position = -(len(syllable_list) - tilde)
            # Elif the word is paroxytone (llana)
            # we save the penultimate syllable.
            elif is_paroxytone(syllable_list):
                stressed_position = -2
            # If the word does not meet the above criteria that means
            # that it's an oxytone word (aguda).
            else:
                stressed_position = -1
    else:
        if len(syllable_list) == 1:
            first_monosyllable = syllable_list[0].lower()
            # A monosyllable is stressed when it is not in the unstressed
            # monosyllables list, at least one of the lexical or PoS
            # conditions holds, and it is not an unstressed form.
            # NOTE(review): in the clause
            # `pos in ("ADJ", "DET" and tag.get("Poss", None) == "Yes")`
            # the parenthesis closes after the comparison, so pos is tested
            # against ("ADJ", <bool>) - confirm the intent before touching it
            if ((first_monosyllable not in UNSTRESSED_UNACCENTED_MONOSYLLABLES)
                    and (
                        first_monosyllable in STRESSED_UNACCENTED_MONOSYLLABLES
                        or pos not in (
                            "SCONJ", "CCONJ", "DET", "PRON", "ADP")
                        or (pos == "PRON" and tag.get("Case") == "Nom")
                        or (pos == "DET" and tag.get("Definite") in (
                            "Dem", "Ind"))
                        or pos in (
                            "PROPN", "NUM", "NOUN", "VERB", "AUX", "ADV")
                        or (pos == "ADJ" and tag.get("Poss", None) != "Yes")
                        or (pos == "PRON" and tag.get("PronType", None) in (
                            "Prs", "Ind"))
                        or (pos == "DET"
                            and tag.get("PronType", None) == "Ind")
                        or (pos in ("ADJ", "DET"
                                    and tag.get("Poss", None) == "Yes"))
                        or (pos in ("PRON", "DET")
                            and tag.get("PronType", None) in (
                                "Exc", "Int", "Dem"))
                        or "".join(word).lower() in STRESSED_PRON)
                    and (
                        word_lower not in UNSTRESSED_FORMS)):
                stressed_position = -1
            else:
                stressed_position = 0  # unstressed monosyllable
        else:
            tilde = get_orthographic_accent(syllable_list)
            # An orthographic accent settles the stress regardless of PoS
            if tilde is not None:
                stressed_position = tilde - len(syllable_list)
            # Without an accent only the listed PoS / pronoun types are
            # stressed, and never the explicitly unstressed forms
            elif (pos in ("INTJ", "PROPN", "NUM", "NOUN", "VERB", "AUX", "ADV")
                  or pos == "ADJ"
                  or (pos == "PRON" and tag.get("PronType", None) in (
                      "Prs", "Ind"))
                  or (pos == "DET" and tag.get("PronType", None) in (
                      "Dem", "Ind"))
                  or (pos == "DET" and tag.get("Definite", None) == "Ind")
                  or (pos == "PRON" and tag.get("Poss", None) == "Yes")
                  or (pos in ("PRON", "DET")
                      and tag.get("PronType", None) in ("Exc", "Int", "Dem"))
                  or (word_lower in STRESSED_PRON)) and (
                    word_lower not in UNSTRESSED_FORMS) and (
                    word_lower not in POSSESSIVE_PRON_UNSTRESSED):
                tilde = get_orthographic_accent(syllable_list)
                # If an orthographic accent exists,
                # the syllable negative index is saved
                # (the accented case was already handled by the branch
                # above, so tilde is None whenever this point is reached)
                if tilde is not None:
                    stressed_position = -(len(syllable_list) - tilde)
                # Elif the word is paroxytone (llana)
                # we save the penultimate syllable.
                elif is_paroxytone(syllable_list):
                    stressed_position = -2
                # If the word does not meet the above criteria that means
                # that it's an oxytone word (aguda).
                else:
                    stressed_position = -1
            else:
                stressed_position = 0  # unstressed
    out_syllable_list = []
    for index, syllable in enumerate(syllable_list):
        out_syllable_list.append(
            {
                "syllable": syllable,
                # Distance from the end equals the negated stress position;
                # never true when stressed_position is 0
                "is_stressed": len(syllable_list) - index == -stressed_position
            })
        if index < 1:
            continue
        # Sinaeresis: flag the previous syllable when it can merge with
        # the current one
        first_syllable = syllable_list[index - 1]
        second_syllable = syllable
        if first_syllable and second_syllable and (
                (first_syllable[-1] in STRONG_VOWELS
                 and second_syllable[0] in STRONG_VOWELS)
                or (first_syllable[-1] in STRESSED_WEAK_VOWELS
                    and second_syllable[0] in STRONG_VOWELS)
                or (first_syllable[-1] in STRONG_VOWELS
                    and second_syllable[0] in WEAK_VOWELS)
                or (first_syllable[-1] in STRONG_VOWELS
                    and second_syllable[0] == "h"
                    and second_syllable[1] in STRONG_VOWELS)):
            out_syllable_list[index - 1].update({'has_sinaeresis': True})
    return {
        'word': out_syllable_list,
        "stress_position": stressed_position,
    }
def get_last_syllable(token_list):
    """Gets the last syllable of the last word of a line

    Tokens are inspected from the end of the list; tokens without a "word"
    key (symbols) are skipped.

    :param token_list: list of dictionaries with line tokens
    :return: Last element of the "word" list of the last word token, or
        None when the list holds no word token
    """
    return next(
        (token['word'][-1]
         for token in reversed(token_list)
         if 'word' in token),
        None,
    )
def get_words(word_list, alternative_syllabification=False):
    """Syllabifies every word of a line and marks stresses and synalephas

    Alphabetic tokens become word dictionaries (see get_word_stress) with
    their PoS attached; any other token becomes ``{"symbol": text}``. Verbs
    with affixes are joined back together by join_affixes, and the last
    syllable of each word is flagged with ``has_synalepha`` when it can link
    with the first syllable of the following word.

    :param word_list: List of spacy objects representing a word or sentence
    :param alternative_syllabification: Whether or not the alternative
        syllabification is used
    :return: List of dictionaries, one per word or symbol of the line
    :rtype: list
    """
    # The last alphabetic token of the verse is always stressed. It is
    # located once, up front, instead of once per word.
    alpha_words = [w for w in word_list if w.is_alpha]
    last_alpha_word = alpha_words[-1] if alpha_words else None
    syllabified_words = []
    for word in word_list:
        if not word.is_alpha:
            syllabified_words.append({"symbol": word.text})
            continue
        if '__' in word.tag_:
            pos, tag = word.tag_.split('__')
        else:
            pos = word.pos_ or ""
            tag = word.tag_ or ""
        tags = spacy_tag_to_dict(tag)
        # `is` is used here to be sure it's the same spacy object
        stressed_word = get_word_stress(
            word.text, pos, tags, alternative_syllabification,
            is_last_word=word is last_alpha_word)
        if word.pos_ in ("AUX", "VERB") and word._.affixes_length:
            # join_affixes needs the raw pos and tag of the verb
            stressed_word.update(
                {'affixes_length': word._.affixes_length})
            stressed_word.update({'pos': word.pos_, 'tag': word.tag_})
        stressed_word.update({'pos': pos})
        syllabified_words.append(stressed_word)
    syllabified_words = join_affixes(syllabified_words)
    clean_word_list = [syll for syll in syllabified_words if "word" in syll]
    # Synalepha between the end of a word and the start of the next one
    for current_word, following_word in zip(clean_word_list,
                                            clean_word_list[1:]):
        first_syllable = current_word['word'][-1]
        second_syllable = following_word['word'][0]
        if first_syllable and second_syllable and have_prosodic_liaison(
                first_syllable, second_syllable):
            first_syllable.update({'has_synalepha': True})
    return syllabified_words
def join_affixes(line):
    """Join affixes of split words and recalculates stress

    A word that carries ``affixes_length`` is merged with that many
    following tokens into a single word whose stress is computed again and
    whose ``pos`` is the "+"-joined PoS of its parts. Afterwards, a stress
    exception is applied to the last word of the line. Lines without any
    word token (empty or made only of symbols) are returned untouched.

    :param line: List of syllabified words (dict)
    :return: List of syllabified words (dict) with joined affixes
    :rtype: list
    """
    syllabified_words = []
    indices_to_ignore = []
    for index, word in enumerate(line):
        affixes_length = word.get('affixes_length', None)
        if index in indices_to_ignore:
            # Already absorbed by a previous verb
            continue
        elif affixes_length is None:
            syllabified_words.append(word)
        else:
            # The verb itself plus the affixes that follow it
            indices_to_ignore = range(index, index + affixes_length + 1)
            join_word = []
            for affix_index in indices_to_ignore:
                affix = line[affix_index]['word']
                join_word += [syll["syllable"] for syll in affix]
            # NOTE(review): word["tag"] looks like the raw spaCy tag string
            # while get_word_stress queries its tag with .get - confirm
            word_stress = get_word_stress("".join(join_word), word["pos"],
                                          word["tag"])
            word_stress["word"][-1]["is_word_end"] = True
            syllabified_words.append(word_stress)
            # Add PoS information
            pos_list = [line[part]['pos'] for part in indices_to_ignore]
            word_stress.update({'pos': "+".join(pos_list)})
    word_list = [token for token in syllabified_words if token.get("word")]
    if not word_list:
        # Nothing to adjust: the line has no words at all
        return syllabified_words if syllabified_words else line
    # Handle stress exception on the last word of the line
    last_word = word_list[-1]
    stresses_list = [syll["is_stressed"] for syll in last_word["word"]]
    if stresses_list.count(True) >= 1:
        last_word_stress = stresses_list.index(True) - len(last_word["word"])
        last_word_is_verb_with_affixes = re.match(r"VERB\+",
                                                  last_word["pos"])
        last_word_is_adverb = last_word["pos"] == "ADV"
        if len(last_word["word"]) >= 3:
            # A verb with enclitic pronouns stressed on the antepenultimate
            # syllable gets its stress moved to the last syllable
            if last_word_stress == -3 and last_word_is_verb_with_affixes:
                set_stress_exceptions(last_word)
                last_word["stress_position"] = -1
            # Same for any word stressed before the antepenultimate
            # syllable, unless it is an adverb
            elif last_word_stress <= -4 and not last_word_is_adverb:
                set_stress_exceptions(last_word)
                last_word["stress_position"] = -1
    return syllabified_words if syllabified_words else line
def set_stress_exceptions(word):
    """Moves the stress of a word to its last syllable

    Every syllable of the word is updated in place: the last one becomes
    stressed and all the others unstressed.

    :param word: The word that is going to be changed
    :return: Word with the new stresses (the same object that was passed in)
    """
    syllables = word["word"]
    final_position = len(syllables) - 1
    for position, syllable in enumerate(syllables):
        syllable["is_stressed"] = position == final_position
    return word
def get_scansion(text, rhyme_analysis=False, rhythm_format="pattern",
                 rhythmical_lengths=None, split_stanzas_on=None,
                 pos_output=False, always_return_rhyme=False,
                 rhythmical_lengths_window=8, alternative_output=False):
    """Generates a list of dictionaries for each line

    Without ``split_stanzas_on`` the whole text is analysed at once. With
    it, the text is split with that regular expression and every stanza is
    analysed on its own.

    :param text: Full text to be analyzed
    :param rhyme_analysis: Specify if rhyme analysis is to be performed
    :param rhythm_format: output format for rhythm analysis
    :param rhythmical_lengths: List with explicit rhythmical lengths per line
        that the analysed lines has to meet
    :param split_stanzas_on: Regular expression to split text in stanzas.
        Defaults to None for not splitting.
    :param pos_output: `True` or `False` for printing the PoS of the words
    :param always_return_rhyme: `True` or `False` for printing rhyme pattern
        even if no structure is detected
    :param rhythmical_lengths_window: Size of the window to calculate the
        most frequent line length when rhythmical_lengths is not given.
        Defaults to 8
    :param alternative_output: Whether or not to return the scansion in a
        new format compliant with postdata API. It is only applied when
        stanzas are split and both rhyme_analysis and always_return_rhyme
        are set.
    :return: list of dictionaries per line
        (or list of list of dictionaries if split on stanzas)
    :rtype: list
    """
    shared_options = dict(
        rhyme_analysis=rhyme_analysis,
        rhythm_format=rhythm_format,
        rhythmical_lengths=rhythmical_lengths,
        pos_output=pos_output,
        always_return_rhyme=always_return_rhyme,
        rhythmical_lengths_window=rhythmical_lengths_window,
        alternative_output=alternative_output,
    )
    if split_stanzas_on is None:
        return _get_scansion(text=text, **shared_options)
    stanzas = re.split(split_stanzas_on, text)
    scansion = [
        _get_scansion(text=stanza, **shared_options) for stanza in stanzas
    ]
    if alternative_output and rhyme_analysis and always_return_rhyme:
        scansion = transform_scansion(scansion, text)
    return scansion
def _get_scansion(text, rhyme_analysis=False, rhythm_format="pattern",
                  rhythmical_lengths=None, split_stanzas_on=None,
                  pos_output=False, always_return_rhyme=False,
                  rhythmical_lengths_window=8, alternative_output=False):
    """Generates a list of dictionaries for each line

    The text is tokenized, split into lines at newline tokens, and every
    line gets its words, phonological groups and rhythm. When a target
    length is known for a line (given explicitly, taken from the detected
    structure, or inferred from the neighbouring lines) and the line is
    shorter than it, alternative phonological groupings are tried until one
    of them has exactly that length.

    :param text: Full text to be analyzed, or an already processed spaCy Doc
    :param rhyme_analysis: Specify if rhyme analysis is to be performed
    :param rhythm_format: Output format for rhythm analysis
    :param rhythmical_lengths: List with explicit rhythmical lengths per line
        that the analysed lines has to meet
    :param split_stanzas_on: String or regular expression to split text in
        stanzas. Defaults to None for not splitting. Here it is only
        consulted to decide whether the alternative output is produced.
    :param pos_output: `True` or `False` for printing the PoS of the words
    :param always_return_rhyme: `True` or `False` for printing rhyme pattern
        even if no structure is detected
    :param rhythmical_lengths_window: Size of the window to calculate the
        most frequent line length when rhythmical_lengths is not given.
        Defaults to 8
    :param alternative_output: Whether or not to return the scansion in a
        new format compliant with postdata API
    :return: list of dictionaries per line
    :rtype: list
    """
    if isinstance(text, Doc):
        tokens = text
    else:
        nlp = load_pipeline()
        tokens = nlp(text)
    seen_tokens = []
    lines = []
    # Original tokens of every line, kept to regenerate candidates later
    raw_tokens = []
    # Handle multi-line sentences and create the line with words
    for token in tokens:
        # A newline token closes the current line, but only if the line
        # already has tokens; otherwise the token is kept with the line
        if (token.pos_ == SPACE
                and '\n' in token.orth_
                and len(seen_tokens) > 0):
            lines.append({"tokens": get_words(seen_tokens, False)})
            raw_tokens.append(seen_tokens)
            seen_tokens = []
        else:
            seen_tokens.append(token)
    # Flush the last line when the text does not end with a newline
    if len(seen_tokens) > 0:
        lines.append({"tokens": get_words(seen_tokens, False)})
        raw_tokens.append(seen_tokens)
    # Extract phonological groups and rhythm per line
    for line in lines:
        syllables = get_syllables_word_end(line["tokens"])
        # Sinaeresis is applied first, then synalepha (the default type)
        phonological_groups = get_phonological_groups(
            get_phonological_groups(syllables, liaison_type="sinaeresis")
        )
        line.update({
            "phonological_groups": phonological_groups,
            "rhythm": get_rhythmical_pattern(phonological_groups,
                                             rhythm_format,
                                             rhyme_analysis=rhyme_analysis)
        })
    if rhyme_analysis:
        analyzed_lines = analyze_rhyme(lines,
                                       always_return_rhyme=always_return_rhyme)
        if analyzed_lines is not None:
            # Copy the per-line results of the rhyme analysis into each line
            for rhyme in [analyzed_lines]:
                for index, line in enumerate(lines):
                    line["structure"] = rhyme.get("name", "unknown")
                    line["rhyme"] = rhyme["rhyme"][index]
                    line["ending"] = rhyme["endings"][index]
                    line["ending_stress"] = rhyme["endings_stress"][index]
                    # Lines with an ending stress of 0 get no rhyme type
                    if line["ending_stress"] == 0:
                        line["rhyme_type"] = ""
                        line["rhyme_relaxation"] = None
                    else:
                        line["rhyme_type"] = rhyme["rhyme_type"]
                        line["rhyme_relaxation"] = rhyme["rhyme_relaxation"]
    lines_length = len(lines)
    structure_length = rhythmical_lengths if rhythmical_lengths else None
    for idx, line in enumerate(lines):
        if not structure_length:
            # Handle repeating stanzas: the lengths of the structure are
            # repeated as many whole times as they fit in the poem
            line_structure = line.get("structure", None)
            structure_length, repeating_structure = STRUCTURES_LENGTH.get(
                line_structure, [[], False])
            if structure_length and repeating_structure:
                repetitions = int(lines_length / len(structure_length))
                structure_length = structure_length * repetitions
        # Target length: explicit or structural first, then the most
        # frequent length of the surrounding lines, otherwise unknown
        if structure_length:
            structure_length_idx = structure_length[idx]
        elif lines_length > 1:
            structure_length_idx = get_structure_from_context(
                lines, idx, window=rhythmical_lengths_window
            )
        else:
            structure_length_idx = None
        if structure_length_idx is not None:
            # Only lines that fall short of the target are re-analysed
            if line["rhythm"]["length"] < structure_length_idx:
                candidates = generate_phonological_groups(raw_tokens[idx])
                for candidate in candidates:
                    rhythm = get_rhythmical_pattern(
                        candidate, rhythm_format,
                        rhyme_analysis=rhyme_analysis)
                    # Keep the first candidate with the exact target length
                    if rhythm["length"] == structure_length_idx:
                        line.update({
                            "phonological_groups": candidate,
                            "rhythm": rhythm,
                        })
                        break
    if not pos_output:
        remove_pos_from_output(lines)
    # scansion = remove_exact_length_matches(lines)
    scansion = lines
    if alternative_output\
            and split_stanzas_on is not None\
            and rhyme_analysis\
            and always_return_rhyme:
        scansion = transform_scansion(scansion, text)
    return scansion
def get_structure_from_context(lines, n, window=3):
    """Get the most frequent line length around line n using a window

    The reference line itself is left out of the count. When several
    lengths are equally frequent, the one that appears first in the window
    wins.

    :param lines: List of dictionary lines of the poem
    :param n: Integer with the reference position
    :param window: Integer with the size of the window around the reference
        position. Defaults to 3
    :return: The most frequent line length, or None when there are no
        neighbouring lines inside the window
    """
    context = (
        lines[max(n - window, 0):n]
        + lines[n + 1:min(n + window + 1, len(lines))]
    )
    if not context:
        # A single-line poem or an empty window has no context to learn from
        return None
    lengths = [line["rhythm"]["length"] for line in context]
    return Counter(lengths).most_common(1)[0][0]
def remove_pos_from_output(lines):
    """Remove `pos` tag from the output dictionary

    The `pos` key is dropped in place from every word token of every line.
    Symbol tokens are left alone, and lines without tokens or words without
    a `pos` key are skipped silently.

    :param lines: List of dictionary lines of the poem
    :return: The same list of lines with the key removed
    :rtype: list
    """
    for line in lines:
        for token in line.get("tokens") or []:
            if token.get("word"):
                token.pop("pos", None)
    return lines
def break_on_h(liaison_type, syllable_left, syllable_right):
    """Tell whether a synalepha is blocked by an initial "h" on the right

    :param liaison_type: Name of the liaison being considered
    :param syllable_left: Syllable dictionary on the left (not inspected)
    :param syllable_right: Syllable dictionary on the right; its "syllable"
        text is checked
    :return: True if the liaison is a synalepha and the right syllable
        starts with "h" (case-insensitive), False otherwise
    :rtype: bool
    """
    if liaison_type != "synalepha":
        return False
    first_letter = syllable_right["syllable"][0]
    return first_letter.lower() == "h"
def generate_phonological_groups(tokens):
    """Generates phonological groups from a list of tokens

    Candidates are produced first with and then without the alternative
    syllabification. For each of those, every liaison order is tried, each
    one with and without breaking synalephas on "h", over all the liaison
    position combinations.

    :param tokens: list of spaCy tokens
    :return: Generator with a list of phonological groups
    :rtype: generator
    """
    liaison_orders = (
        ("synalepha",),
        ("synalepha", "sinaeresis"),
        ("sinaeresis",),
        ("sinaeresis", "synalepha"),
    )
    breakage_funcs = (break_on_h, None)
    for use_alternative in (True, False):
        words = get_words(tokens, use_alternative)
        syllables = get_syllables_word_end(words)
        # product() iterates breakage_funcs fastest, like a nested loop
        for liaison_order, breakage in product(liaison_orders,
                                               breakage_funcs):
            first_type = liaison_order[0]
            for first_positions in generate_liaison_positions(
                    syllables, first_type):
                groups = get_phonological_groups(
                    syllables[:],
                    liaison_type=first_type,
                    liaison_positions=first_positions,
                    breakage_func=breakage,
                )
                if len(liaison_order) == 1:
                    yield groups
                    continue
                # Apply the second kind of liaison on top of the first pass
                second_type = liaison_order[1]
                for second_positions in generate_liaison_positions(
                        syllables, second_type):
                    yield get_phonological_groups(
                        groups,
                        liaison_type=second_type,
                        liaison_positions=second_positions,
                        breakage_func=breakage,
                    )
def generate_liaison_positions(syllables, liaison):
    """Generates all possible combinations for the liaisons on a list of
    syllables

    Combinations without consecutive liaisons are yielded first, as they are
    produced; the remaining ones are held back and yielded at the end.

    :param syllables: List of syllable dictionaries; a truthy
        ``has_<liaison>`` entry marks a position where the liaison may apply
    :param liaison: Type of liaison combination to be generated
    :return: Generator with a list of possible combinations, each a list of
        1's and 0's with one entry per syllable
    :rtype: generator
    """
    positions = [int(syllable.get(f"has_{liaison}", 0))
                 for syllable in syllables]
    liaison_indices = [
        index for index, position in enumerate(positions) if position
    ]
    # Prioritize single liaisons
    non_single_liaisons = []
    # Combinations start by applying all possible liaisons: (1, 1, ...).
    # They are consumed lazily, one per candidate position, instead of
    # building the whole 2**k list up front.
    for combination in product([1, 0], repeat=len(liaison_indices)):
        liaison_positions = [0] * len(positions)
        for liaison_index, applied in zip(liaison_indices, combination):
            liaison_positions[liaison_index] = applied
        if has_single_liaisons(liaison_positions):
            yield liaison_positions
        else:
            non_single_liaisons.append(liaison_positions)
    yield from non_single_liaisons
def has_single_liaisons(liaisons):
    """Tell whether every liaison (1) in a list of 1's and 0's stands alone,
    that is, whether no two adjacent entries are both 1

    :param liaisons: List of possible liaisons to apply per phonological
        group
    :return: True if no consecutive liaisons, False otherwise
    :rtype: bool
    """
    for current, following in zip(liaisons, liaisons[1:]):
        if current == following == 1:
            return False
    return True
def remove_exact_length_matches(lines):
    """Removes key "length_range" on lines with an exact length match

    The line dictionaries are modified in place.

    :param lines: List of dictionary lines of the poem
    :return: Returns the lines list without the "length_range" on lines
        with an exact length match
    """
    # Only the inner "rhythm" dictionaries change, never the sequence of
    # lines itself, so it is safe to iterate it directly without a copy
    for line in lines:
        rhythm = line["rhythm"]
        if "length_range" not in rhythm:
            continue
        ranges = rhythm["length_range"]
        if ranges["min_length"] == ranges["max_length"]:
            del rhythm["length_range"]
    return lines
def join_syllables(token):
    """Return the text of a token: its symbol, or its syllables put together

    :param token: Dictionary representing a token, holding either a "symbol"
        string or a "word" list of syllable dictionaries
    :return: The symbol, or the string of joined syllables
    """
    if "symbol" in token:
        return token["symbol"]
    return "".join(syllable["syllable"] for syllable in token["word"])
def transform_scansion(scansion, text):
    """Converts the old output to the new format

    :param scansion: List of stanzas, each one a list of line dictionaries
        with the keys "tokens", "phonological_groups", "rhythm",
        "structure", "rhyme", "ending", "ending_stress" and "rhyme_type"
    :param text: Original text of the poem, with stanzas separated by a
        blank line
    :return: New scansion dictionary, built from a deep copy of ``scheme``
        with its "stanzaList" filled in
    :rtype: dict
    """
    # Templates are only read here; every use below takes its own deep copy
    stanza_template = scheme["stanzaList"][0]
    line_template = stanza_template["lineList"][0]
    word_template = line_template["tokenList"][0]
    syllable_template = word_template["syllableList"][0]
    group_template = line_template["phonologicalGroups"][0]

    poem = copy.deepcopy(scheme)
    # Guard against an empty scansion or an empty first stanza
    is_sonnet = (
        bool(scansion) and bool(scansion[0])
        and scansion[0][0].get("structure") == "sonnet"
    )
    stanzas_content_list = text.split('\n\n')
    line_content_list = []
    for stanza_text in stanzas_content_list:
        line_content_list.extend(stanza_text.splitlines())

    stanzas_list = []
    absolute_line_number = 1
    for st_idx, stanza in enumerate(scansion):
        line_list = []
        # The stanza takes the structure of its last line; reset it so an
        # empty stanza does not inherit the one of the previous stanza
        structure = None
        for line_idx, line in enumerate(stanza):
            # Fields are read by key rather than by position in the dict
            rhythm = line["rhythm"]
            structure = line["structure"]
            ending = line["ending"]
            ending_stress = line["ending_stress"]

            phonological_list = []
            for pg in line["phonological_groups"]:
                phonological_dict = copy.deepcopy(group_template)
                phonological_dict.update({
                    "content": pg["syllable"],
                    "isStressed": pg["is_stressed"],
                    "isWordEnd": pg.get("is_word_end", False),
                    "sinaeresisIndex": pg.get("sinaeresis_index"),
                    "synalephaIndex": pg.get("synalepha_index"),
                })
                phonological_list.append(phonological_dict)

            token_list = []
            word_number = 1  # Symbols do not advance the word counter
            for token in line["tokens"]:
                content = join_syllables(token)
                if "symbol" in token:
                    token_list.append({"content": content, "type": "symbol"})
                    continue
                word_dict = copy.deepcopy(word_template)
                word_dict.update({
                    "content": content,
                    "type": "word",
                    "wordNumber": word_number
                })
                word_number += 1
                syllable_list = []
                for s in token["word"]:
                    syllable_dict = copy.deepcopy(syllable_template)
                    syllable_dict.update({
                        "content": s["syllable"],
                        "isStressed": s["is_stressed"],
                        "isWordEnd": s.get("is_word_end", False),
                        "hasSinaeresis": s.get("has_sinaeresis", False),
                        "hasSynalepha": s.get("has_synalepha", False),
                    })
                    # Every original key but the text is carried over as well
                    for key, value in s.items():
                        if key != "syllable":
                            syllable_dict[key] = value
                    syllable_list.append(syllable_dict)
                word_dict.update({"syllableList": syllable_list})
                token_list.append(word_dict)

            line_pattern_dict = copy.deepcopy(line_template["linePattern"])
            line_pattern_dict.update({
                "lineMinLength": rhythm["length_range"]["min_length"],
                "lineMaxLength": rhythm["length_range"]["max_length"],
                "syllabicMetricalLength": rhythm["length"],
                "patterningMetricalScheme": rhythm["stress"],
                "employs": rhythm["type"],
                # Stressed groups in upper case, unstressed in lower case
                "scannedLine": " ".join(
                    syl["content"].upper() if syl["isStressed"]
                    else syl["content"].lower()
                    for syl in phonological_list)
            })
            rhyme_dict = copy.deepcopy(line_template["rhyme"])
            rhyme_dict.update({
                "rhymeLabel": line["rhyme"],
                "ending": ending,
                "rhymeGrapheme": ending,
                "typeOfRhymeMatching": line["rhyme_type"],
                "endingStressedVowel": ending[
                    ending_stress] if ending else None,
            })
            new_line = copy.deepcopy(line_template)
            new_line.update({
                "relativeLineNumber": line_idx + 1,
                "absoluteLineNumber": absolute_line_number,
                "content": line_content_list[absolute_line_number - 1],
                "linePattern": line_pattern_dict,
                "rhyme": rhyme_dict,
                "phonologicalGroups": phonological_list,
                "tokenList": token_list
            })
            line_list.append(new_line)
            absolute_line_number += 1

        stanza_dict = copy.deepcopy(stanza_template)
        stanza_pattern_dict = copy.deepcopy(stanza_template["stanzaPattern"])
        stanza_pattern_dict.update({
            "metricalType": structure,
            "rhymeScheme": "".join(
                [line["rhyme"]["rhymeLabel"] for line in line_list]),
        })
        stanza_dict.update({
            "stanzaPattern": stanza_pattern_dict,
            "stanzaNumber": st_idx + 1,
            "content": stanzas_content_list[st_idx],
            "lineList": line_list})
        stanzas_list.append(stanza_dict)

    poem.update({
        "stanzaList": stanzas_list})
    if is_sonnet:
        poem["structure"] = "sonnet"
        # The stanzas live under "stanzaList", the key set just above.
        # zip() stops at the shorter sequence, so a poem with fewer than
        # four stanzas is labelled as far as it goes instead of failing.
        sonnet_stanzas = ("cuarteto", "cuarteto", "terceto", "terceto")
        for stanza_dict, stanza_structure in zip(poem["stanzaList"],
                                                 sonnet_stanzas):
            stanza_dict["structure"] = stanza_structure
    return poem