Identifying Common Refrains / Repeating Phrases In Lang’s Fairy Story Collections#

Many stories incorporate a repeating phrase or refrain, but you may need to read quite a long way into a story before you can identify that repeating phrase. So are there any tools we might be able to use to find such refrains automatically?

from sqlite_utils import Database

# Open the SQLite database of fairy tales via sqlite_utils and keep a
# handle on the underlying connection object.
db_name = "lang_fairy_tale.db"
db = Database(db_name)
conn = db.conn

# Load in the sql magic
# (IPython-only syntax: these two lines are notebook magics, not plain Python.)
%load_ext sql
%sql sqlite:///$db_name
#db = Database(db_name)
              
# The search term carries its own double quotes; presumably so that the
# full-text index treats it as an exact phrase rather than two separate
# words -- confirm against the SQLite FTS query syntax.
q2 = '"pretty hen"'

# db.quote() renders the search term as a quoted SQL string so that it can be
# embedded directly in the query text.
_q = f'SELECT * FROM books_fts WHERE books_fts MATCH {db.quote(q2)} ;'

# Print the title of every matching story.
# NOTE: `row` stays bound to the last matching record after the loop ends,
# and later cells reuse it (e.g. row["text"]).
for row in db.query(_q):
    print(row["title"])
The House In The Wood
import nltk
from nltk.util import ngrams as nltk_ngrams

tokens = nltk.word_tokenize(row["text"])

size = 5
#for i in nltk_ngrams(tokens, size):
#    print(' '.join(i))

We could then look for repeating phrases:

import pandas as pd

df = pd.DataFrame({'phrase':[' '.join(i) for i in nltk_ngrams(tokens, size)]})
df['phrase'].value_counts()
, pretty brindled cow ,        4
And you , pretty brindled      4
you , pretty brindled cow      4
pretty brindled cow , What     4
brindled cow , What do         4
                              ..
leaving him all day without    1
for leaving him all day        1
wife for leaving him all       1
his wife for leaving him       1
to go hungry . '               1
Name: phrase, Length: 1787, dtype: int64

Really, we need to do a scan down from large token size until we find a match (longest match phrase).

But for now, let’s see what repeating elements we get from one of those search phrases:

import re

# The phrase to look for; it is passed to re.finditer() below and is therefore
# interpreted as a regular expression pattern.
# NOTE: this rebinds `_q`, which an earlier cell used for its SQL query string.
_q = 'pretty brindled cow'

# `row` is the record left over from the earlier full-text search loop.
for m in re.finditer(_q, row["text"]):
    # Display the matched terms and the 50 characters
    # immediately preceding and following the phrase 
    # NOTE(review): the label printed here is `q2` (the earlier full-text
    # search term), not `_q` (the phrase actually being matched) -- confirm
    # whether that is intended.
    print(f'===\n{q2}: ', m.start(), m.end(), row["text"][max(0, m.start()-50):m.end()+50])
===
"pretty hen":  1566 1585 
The man said:

Pretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?

'Duks,' answered the beast
===
"pretty hen":  3505 3524 ed the beasts:

Pretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?

The beasts answered, 'Duks
===
"pretty hen":  4932 4951  beasts again:

Pretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?

'Duks,' they said. Then th
===
"pretty hen":  6119 6138  to rest now?'

Pretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?

The animals said, 'Duks:

Make a function for that:

def find_contexts(text, phrase, width=50, regex=False):
    """Find the context(s) of a phrase within a text.

    Args:
        text: The text to search.
        phrase: The phrase to look for. By default it is matched literally,
            so punctuation such as ``?``, ``.`` or ``(`` has no special
            meaning.
        width: Number of characters of context to include on either side
            of each match.
        regex: If true, treat ``phrase`` as a regular expression pattern
            rather than as literal text.

    Returns:
        A list of strings, one per match in order of appearance, each made up
        of the match plus up to ``width`` characters before and after it.
        An empty ``phrase`` yields an empty list.
    """
    # An empty pattern would "match" at every character offset.
    if not phrase:
        return []

    # Escape the phrase unless the caller explicitly asked for regex
    # semantics; otherwise a phrase such as "say now?" would be read as
    # "say no" followed by an optional "w".
    pattern = phrase if regex else re.escape(phrase)

    # Clamp the left edge at 0 so that a match near the start of the text
    # does not produce a negative (wrap-around) slice index.
    return [
        text[max(0, m.start() - width):m.end() + width]
        for m in re.finditer(pattern, text)
    ]

for i in find_contexts(row['text'], 'pretty brindled cow'):
    print(i,"\n==")
The man said:

Pretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?

'Duks,' answered the beast 
==
ed the beasts:

Pretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?

The beasts answered, 'Duks 
==
 beasts again:

Pretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?

'Duks,' they said. Then th 
==
 to rest now?'

Pretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?

The animals said, 'Duks:

 
==
find_contexts(row['text'], 'pretty brindled cow')
["\nThe man said:\n\nPretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?\n\n'Duks,' answered the beast",
 "ed the beasts:\n\nPretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?\n\nThe beasts answered, 'Duks",
 " beasts again:\n\nPretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?\n\n'Duks,' they said. Then th",
 " to rest now?'\n\nPretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?\n\nThe animals said, 'Duks:\n\n"]

We can also make this a SQLite lookup function:

from vtfunc import TableFunction

def concordances(text, phrase, width=50, regex=False):
    """Find the concordances of a phrase in a text.

    Args:
        text: The text to search.
        phrase: The phrase to look for. By default it is matched literally,
            so punctuation such as ``?``, ``.`` or ``(`` has no special
            meaning.
        width: Number of characters of context to include on either side
            of each match.
        regex: If true, treat ``phrase`` as a regular expression pattern
            rather than as literal text.

    Returns:
        A list of ``(context, start, end)`` tuples, one per match in order of
        appearance. ``context`` is the match plus up to ``width`` characters
        either side; ``start`` and ``end`` are the character offsets of the
        match itself within ``text``. An empty ``phrase`` yields an empty list.
    """
    # An empty pattern would "match" at every character offset.
    if not phrase:
        return []

    # Escape the phrase unless the caller explicitly asked for regex
    # semantics, so that punctuation in a phrase cannot change what matches.
    pattern = phrase if regex else re.escape(phrase)

    # Clamp the left edge at 0 so that a match near the start of the text
    # does not produce a negative (wrap-around) slice index.
    return [
        (text[max(0, m.start() - width):m.end() + width], m.start(), m.end())
        for m in re.finditer(pattern, text)
    ]


class Concordances(TableFunction):
    """Table-valued SQLite function wrapping ``concordances()``.

    In SQL it is called as ``concordance(phrase, text)`` and produces one row
    per match with the columns ``match``, ``start`` and ``end``.
    """

    # Name under which the function is exposed to SQL.
    name = 'concordance'
    # Arguments accepted by the SQL function, in call order.
    params = ['phrase', 'text']
    # Columns of each generated row.
    columns = ['match', 'start', 'end']

    def initialize(self, phrase=None, text=None):
        """Compute all matches up front and keep an iterator over them."""
        matches = concordances(text, phrase)
        self._iter = iter(matches)

    def iterate(self, idx):
        """Return the next ``(match, start, end)`` row.

        ``idx`` is not used. When the matches run out, ``next()`` raises
        ``StopIteration``; presumably that is how the table function
        machinery detects the end of the result set -- see the vtfunc docs.
        """
        context, start, end = next(self._iter)
        return (context, start, end)

Concordances.register(db.conn)
concordances(row['text'], 'pretty brindled cow')
[("\nThe man said:\n\nPretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?\n\n'Duks,' answered the beast",
  1566,
  1585),
 ("ed the beasts:\n\nPretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?\n\nThe beasts answered, 'Duks",
  3505,
  3524),
 (" beasts again:\n\nPretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?\n\n'Duks,' they said. Then th",
  4932,
  4951),
 (" to rest now?'\n\nPretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?\n\nThe animals said, 'Duks:\n\n",
  6119,
  6138)]
for i in db.execute('SELECT matched.* FROM books, concordance("pretty brindled cow", books.text) as matched WHERE title="The House In The Wood";'):
    print(i)
("\nThe man said:\n\nPretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?\n\n'Duks,' answered the beast", 1566, 1585)
("ed the beasts:\n\nPretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?\n\nThe beasts answered, 'Duks", 3505, 3524)
(" beasts again:\n\nPretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?\n\n'Duks,' they said. Then th", 4932, 4951)
(" to rest now?'\n\nPretty cock, Pretty hen, And you, pretty brindled cow, What do you say now?\n\nThe animals said, 'Duks:\n\n", 6119, 6138)
# allow different tokenisers
from nltk.tokenize import RegexpTokenizer

def scanner(text, minlen=4, startlen=50, min_repeats=3, autostop=True):
    """Search a text for repeated phrases above a minimum length.

    The text is split into word tokens, and n-grams are counted for each
    size from ``startlen`` down to ``minlen`` tokens.

    Args:
        text: The text to scan. ``None`` or an empty string gives an empty
            result.
        minlen: Smallest phrase length (in tokens) to consider.
        startlen: Largest phrase length (in tokens) to consider; capped at
            the number of tokens in the text.
        min_repeats: Minimum number of occurrences for a phrase to count as
            repeating.
        autostop: If true, stop at the first (i.e. longest) size at which
            some phrase occurs at least ``min_repeats`` times. If false, keep
            going, so the result describes phrases of length ``minlen``.

    Returns:
        A pandas Series indexed by phrase (tokens joined by single spaces)
        whose values are occurrence counts. It holds the most frequent
        phrase(s) of the final size examined, provided they occur at least
        ``min_repeats`` times; otherwise it is empty. It is also empty when
        the text has fewer than ``minlen`` tokens.
    """
    # Nothing to scan (e.g. a NULL column value arriving from SQLite).
    if not text:
        return pd.Series(dtype="int64")

    tokens = nltk.word_tokenize(text)

    # An n-gram cannot be longer than the text itself, so never start the
    # scan with a size larger than the number of tokens available.
    startlen = min(startlen, len(tokens))

    # Stays None if the loop body never runs, i.e. when the text has fewer
    # than `minlen` tokens.
    phrase_counts = None

    # Start with long sequences, then work down to the minimum length.
    for size in range(startlen, minlen - 1, -1):
        phrases = [' '.join(gram) for gram in nltk_ngrams(tokens, size)]

        # Occurrence count of each distinct phrase, most frequent first.
        phrase_counts = pd.Series(phrases, name='phrase').value_counts()

        # Once some phrase repeats often enough there is no need to look
        # at shorter phrases.
        if autostop and phrase_counts.max() >= min_repeats:
            break

    if phrase_counts is None:
        return pd.Series(dtype="int64")

    # Keep only the most frequent phrase(s), and only if they repeat
    # often enough.
    top_count = phrase_counts.max()
    return phrase_counts[(phrase_counts >= min_repeats) & (phrase_counts == top_count)]
scanner( row["text"] )
: Pretty cock , Pretty hen , And you , pretty brindled cow , What do you say now ?    3
Name: phrase, dtype: int64
# Display the first (0'th indexed) item
# (In this case there is only one item that repeats this number of times anyway.)
scanner( row["text"] ).index[0], scanner( row["text"] ).values[0]
(': Pretty cock , Pretty hen , And you , pretty brindled cow , What do you say now ?',
 3)

If we constrain this function to return a single item, we can create a simple SQLite function that will search through records and return the longest phrase above a certain minimum length (or the first longest phrase, if several long phrases of the same length are found):

def find_repeating_phrase(text):
    """Return the longest repeating phrase found in a text.

    The text is passed to ``scanner()`` with its default settings. If several
    phrases are returned, the first one is used; if none are, the result is
    ``None``.
    """
    candidates = scanner(text)

    # The scanner's index holds the phrases themselves; take the first if
    # there is one.
    return None if candidates.empty else candidates.index[0]
find_repeating_phrase(row['text'])
': Pretty cock , Pretty hen , And you , pretty brindled cow , What do you say now ?'
# The `db` object is a sqlite_utils database object
# Pass in:
# - the name of the function we want to use in the database
# - the number of arguments it takes
# - the function we want to invoke
db.conn.create_function('find_repeating_phrase', 1, find_repeating_phrase)
_q = """
SELECT book, title, find_repeating_phrase(text) AS phrase 
FROM books WHERE title="The House In The Wood" ;
"""

for row2 in db.query(_q):
    print(row2)
{'book': 'The Pink Fairy Book', 'title': 'The House In The Wood', 'phrase': ': Pretty cock , Pretty hen , And you , pretty brindled cow , What do you say now ?'}
_q = """
SELECT title, find_repeating_phrase(text) AS phrase
FROM books WHERE book="The Pink Fairy Book" ;
"""

for row3 in db.query(_q):
    if row3['phrase'] is not None:
        print(row3)
{'title': 'Catherine And Her Destiny', 'phrase': 'the court , and'}
{'title': 'Esben And The Witch', 'phrase': "? ' 'Ye -- e -- s ! ' 'Are you coming back again ? ' 'That may be , ' said Esben . 'Then you 'll catch it , '"}
{'title': "Hans, The Mermaid's Son", 'phrase': ", ' said Hans ; ' I"}
{'title': 'How The Dragon Was Tricked', 'phrase': ", ' said the dragon"}
{'title': "How The Hermit Helped To Win The King's Daughter", 'phrase': "'Ask him if he will come with us"}
{'title': 'I Know What I Have Learned', 'phrase': 'and asked his wife whether the cow had calved'}
{'title': 'King Lindorm', 'phrase': 'rode out into the forest'}
{'title': 'Maiden Bright-Eye', 'phrase': ". 'Good evening , ' it said . 'Thanks , Maiden Bright-eye , ' said the dog . 'Where is my brother ? ' 'He is in the serpent-pit . ' 'Where is my wicked sister ? ' 'She is with the noble king . ' 'Alas ! alas !"}
{'title': 'Master And Pupil', 'phrase': ", ' said the boy ."}
{'title': 'Peter Bull', 'phrase': "'Oh , yes , ' said the"}
{'title': 'Princess Minon-Minette', 'phrase': ", ' replied the old woman"}
{'title': "The Bird 'Grip'", 'phrase': 'the horse with the golden shoes'}
{'title': 'The Cunning Shoemaker', 'phrase': ", ' replied the shoemaker"}
{'title': 'The Fir-Tree', 'phrase': "' thought the tree ."}
{'title': 'The Flying Trunk', 'phrase': ". '' ' ''"}
{'title': 'The Goblin And The Grocer', 'phrase': ', and he had'}
{'title': 'The Golden Lion', 'phrase': ', and the young man'}
{'title': 'The House In The Wood', 'phrase': ': Pretty cock , Pretty hen , And you , pretty brindled cow , What do you say now ?'}
{'title': 'The Jackal, The Dove, And The Panther', 'phrase': "which side do you turn to ? '"}
{'title': 'The King Who Would Have A Beautiful Wife', 'phrase': ". ' 'And I"}
{'title': 'The Little Hare', 'phrase': 'the little hare , the little hare ,'}
{'title': 'The Man Without A Heart', 'phrase': ", ' said the"}
{'title': 'The Merry Wives', 'phrase': ", ' said the"}
{'title': 'The Princess In The Chest', 'phrase': "'Sentry , where are you ? Sentry , where are you ?"}
{'title': 'The Shirt-Collar', 'phrase': "! ' said the shirt-collar ,"}
{'title': 'The Slaying Of The Tanuki', 'phrase': '. The Tanuki ,'}
{'title': 'The Snow-Man', 'phrase': "? ' asked the Snow-man ."}
{'title': 'The Snow-Queen', 'phrase': ", ' said the crow ,"}
{'title': 'The Sparrow With The Slit Tongue', 'phrase': 'the house , and'}
{'title': 'The Sprig Of Rosemary', 'phrase': "'Do you , rich as you are ,"}
{'title': 'The Story Of Ciccu', 'phrase': "accept them with my humble duty . '"}
{'title': 'The Three Brothers', 'phrase': "the house . '"}
{'title': "The Troll's Daughter", 'phrase': 'at the bottom of the sea .'}
{'title': 'The Two Brothers', 'phrase': 'seven years and seven months'}
{'title': 'The Water Of Life', 'phrase': 'the talking bird , and a branch of the tree of beauty'}
{'title': 'The White Dove', 'phrase': ", ' said the prince ,"}
{'title': 'The Wounded Lion', 'phrase': 'will hire me for a servant ?'}
{'title': 'Uraschimataro And The Turtle', 'phrase': 'the sea , and'}

The punctuation gets in the way somewhat, so it might be useful if we removed the punctuation and tried again:

#Allow param and de-punctuate

def scanner2(text, minlen=4, startlen=50, min_repeats=4, autostop=True, tokeniser='word'):
    """Search a text for repeated phrases above a minimum length.

    The text is tokenised, and n-grams are counted for each size from
    ``startlen`` down to ``minlen`` tokens.

    Args:
        text: The text to scan. ``None`` or an empty string gives an empty
            result.
        minlen: Smallest phrase length (in tokens) to consider.
        startlen: Largest phrase length (in tokens) to consider; capped at
            the number of tokens in the text.
        min_repeats: Minimum number of occurrences for a phrase to count as
            repeating.
        autostop: If true, stop at the first (i.e. longest) size at which
            some phrase occurs at least ``min_repeats`` times. If false, keep
            going, so the result describes phrases of length ``minlen``.
        tokeniser: ``'depunc_word'`` keeps only runs of word characters, which
            drops punctuation; ``'sent'`` is reserved but not implemented;
            any other value (including the default ``'word'``) uses
            ``nltk.word_tokenize``.

    Returns:
        A pandas Series indexed by phrase (tokens joined by single spaces)
        whose values are occurrence counts. It holds the most frequent
        phrase(s) of the final size examined, provided they occur at least
        ``min_repeats`` times; otherwise it is empty. It is also empty when
        the text has fewer than ``minlen`` tokens.

    Raises:
        NotImplementedError: If ``tokeniser`` is ``'sent'``.
    """
    # This option was only ever a placeholder; fail with a clear message
    # instead of an obscure error about an unassigned variable.
    if tokeniser == 'sent':
        raise NotImplementedError("The 'sent' tokeniser is not implemented.")

    # Nothing to scan (e.g. a NULL column value arriving from SQLite).
    if not text:
        return pd.Series(dtype="int64")

    # Tokenise the text
    if tokeniser == 'depunc_word':
        tokens = RegexpTokenizer(r'\w+').tokenize(text)
    else:
        # eg for default: tokeniser='word'
        tokens = nltk.word_tokenize(text)

    # An n-gram cannot be longer than the text itself, so never start the
    # scan with a size larger than the number of tokens available.
    startlen = min(startlen, len(tokens))

    # Stays None if the loop body never runs, i.e. when the text has fewer
    # than `minlen` tokens.
    phrase_counts = None

    # Start with long sequences, then work down to the minimum length.
    for size in range(startlen, minlen - 1, -1):
        phrases = [' '.join(gram) for gram in nltk_ngrams(tokens, size)]

        # Occurrence count of each distinct phrase, most frequent first.
        phrase_counts = pd.Series(phrases, name='phrase').value_counts()

        # Once some phrase repeats often enough there is no need to look
        # at shorter phrases.
        if autostop and phrase_counts.max() >= min_repeats:
            break

    if phrase_counts is None:
        return pd.Series(dtype="int64")

    # Keep only the most frequent phrase(s), and only if they repeat
    # often enough.
    top_count = phrase_counts.max()
    return phrase_counts[(phrase_counts >= min_repeats) & (phrase_counts == top_count)]
def find_repeating_phrase_depunc(text, minlen):
    """Return the longest repeating phrase in a text, ignoring punctuation.

    ``minlen`` is the minimum phrase length, in tokens, to accept. The search
    is delegated to ``scanner2()`` with the punctuation-dropping tokeniser
    and a required repeat count of 3. If several phrases are returned, the
    first one is used; if none are, the result is ``None``.
    """
    found = scanner2(text, minlen=minlen, min_repeats=3, tokeniser='depunc_word')

    # No phrase repeated often enough.
    if found.empty:
        return None

    # The index holds the phrases themselves; take the first.
    return found.index[0]
find_repeating_phrase_depunc(row['text'], 5)
'Pretty cock Pretty hen And you pretty brindled cow What do you say now'

Register the function:

# Note we need to update the number of arguments (max. 2)
db.conn.create_function('find_repeating_phrase_depunc', 2, find_repeating_phrase_depunc)

Try again:

_q = """
SELECT book, title, find_repeating_phrase_depunc(text, 7) AS phrase
FROM books WHERE book="The Pink Fairy Book" ;
"""

for row5 in db.query(_q):
    if row5['phrase'] is not None:
        print(row5)
{'book': 'The Pink Fairy Book', 'title': 'Esben And The Witch', 'phrase': 'Ye e s Are you coming back again That may be said Esben Then you ll catch it'}
{'book': 'The Pink Fairy Book', 'title': "How The Hermit Helped To Win The King's Daughter", 'phrase': 'Ask him if he will come with us'}
{'book': 'The Pink Fairy Book', 'title': 'I Know What I Have Learned', 'phrase': 'and asked his wife whether the cow had calved'}
{'book': 'The Pink Fairy Book', 'title': 'Maiden Bright-Eye', 'phrase': 'Good evening it said Thanks Maiden Bright eye said the dog Where is my brother He is in the serpent pit Where is my wicked sister She is with the noble king Alas alas'}
{'book': 'The Pink Fairy Book', 'title': "The Bird 'Grip'", 'phrase': 'the horse with the golden shoes and'}
{'book': 'The Pink Fairy Book', 'title': 'The House In The Wood', 'phrase': 'Pretty cock Pretty hen And you pretty brindled cow What do you say now'}
{'book': 'The Pink Fairy Book', 'title': 'The Princess In The Chest', 'phrase': 'My father has set no sentry in War and Pest'}
{'book': 'The Pink Fairy Book', 'title': 'The Shirt-Collar', 'phrase': 'a boot jack and a hair brush'}
{'book': 'The Pink Fairy Book', 'title': 'The Sprig Of Rosemary', 'phrase': 'listened and was sorry for her and'}
{'book': 'The Pink Fairy Book', 'title': "The Troll's Daughter", 'phrase': 'at the bottom of the sea He'}
{'book': 'The Pink Fairy Book', 'title': 'The Water Of Life', 'phrase': 'the talking bird and a branch of the tree of beauty'}
{'book': 'The Pink Fairy Book', 'title': 'The Wounded Lion', 'phrase': 'Who will hire me for a servant'}

Check the context:

_q = """
SELECT text, find_repeating_phrase(text) AS phrase
FROM books WHERE title="Maiden Bright-Eye" ;
"""

for row6 in db.query(_q):
    for c in find_contexts(row6['text'], "Where is my wicked ", 100):
        print(c,"\n===")
    #print(row6['phrase'])
'Thanks, Maiden Bright-eye,' said the dog.

'Where is my brother?'

'He is in the serpent-pit.'

'Where is my wicked sister?'

'She is with the noble king.'

'Alas! alas! I am here this evening, and shall be for two e 
===


'Thanks, Maiden Bright-eye,' said the dog.

'Where is my brother?'

'He is in the serpent-pit.'

'Where is my wicked sister?'

'She is with the noble king.'

'Alas! alas! I am here this evening, and shall be for one e 
===


'Thanks, Maiden Bright-eye,' said the dog.

'Where is my brother?'

'He is in the serpent-pit.'

'Where is my wicked sister?'

'She is with the noble king.'

'Alas! alas! now I shall never come again.' With this it sl 
===
for row6 in db.query(_q):
    for c in find_contexts(row6['text'], "the king's palace", 100):
        print(c,"\n===")
 something about the stepson. He had gone out into the world to look about him, and took service in the king's palace. About this time he got permission to go home and see his sister, and when he saw how lovely and be 
===
 he saw how lovely and beautiful she was, he was so pleased and delighted that when he came back to the king's palace everyone there wanted to know what he was always so happy about. He told them that it was because h 
===
r life, and she was at once transformed into a duck. The duck swam away after the ship, and came to the king's palace on the next evening. There it waddled up the drain, and so into the kitchen, where her little dog l 
===
it.

At this time the brother in the serpent-pit dreamed that his right sister had come swimming to the king's palace in the shape of a duck, and that she could not regain her own form until her beak was cut off. He g 
===

We need to be able to find shorter phrases, down to the minimum length, that are not already contained in a longer phrase:

def scanner_all(text, minlen=4, startlen=50, min_repeats=4, autostop=True):
    """Find every repeated phrase in a text that is not part of a longer one.

    Phrase sizes are scanned from ``startlen`` tokens down to ``minlen``.
    At each size, the most frequent phrase(s) are kept if they occur at least
    ``min_repeats`` times and are not contained, token for token, in a phrase
    that has already been kept.

    Args:
        text: The text to scan. ``None`` or an empty string gives an empty
            result.
        minlen: Smallest phrase length (in tokens) to consider.
        startlen: Largest phrase length (in tokens) to consider; capped at
            the number of tokens in the text.
        min_repeats: Minimum number of occurrences for a phrase to count as
            repeating.
        autostop: Accepted for signature compatibility with the other
            scanners; it has no effect, as every size is always examined.

    Returns:
        A dict mapping each kept phrase (tokens joined by single spaces) to
        its occurrence count, in the order the phrases were found (longest
        first).
    """
    long_phrases = {}

    # Nothing to scan (e.g. a NULL column value arriving from SQLite).
    if not text:
        return long_phrases

    tokens = nltk.word_tokenize(text)

    # An n-gram cannot be longer than the text itself; capping once here
    # avoids recounting the same n-grams for every oversize length.
    startlen = min(startlen, len(tokens))

    for size in range(startlen, minlen - 1, -1):
        phrases = [' '.join(gram) for gram in nltk_ngrams(tokens, size)]
        counts = pd.Series(phrases, name='phrase').value_counts()

        top_count = counts.max()
        # Written as a negated >= so that a missing maximum (no n-grams at
        # this size) is skipped as well.
        if not top_count >= min_repeats:
            continue

        # Series.items() is used because Series.iteritems() no longer exists
        # in current pandas releases.
        for candidate, count in counts[counts == top_count].items():
            # Pad with spaces so that containment is tested on whole tokens:
            # "he said" must not be treated as part of "she said".
            padded = f' {candidate} '
            if not any(padded in f' {kept} ' for kept in long_phrases):
                long_phrases[candidate] = count

    return long_phrases
txt_reps ="""
Nota that There once was a thing that and 5 There once was a thing that and 4 There once was a thing that and 3
There once was a thing that and 1 There once was a thing that and  6 There once was a thing that and 7
there was another that 1 and there was another that 2 and there was another that 3 and there was another that and
there was another that and there was another that 5 and there was another that 9 and there was another that
"""
scanner( txt_reps )
There once was a thing that and    6
Name: phrase, dtype: int64
scanner_all(txt_reps)
{'There once was a thing that and': 6, 'and there was another that': 7}
scanner_all( row["text"])
{'Pretty cock , Pretty hen , And you , pretty brindled cow , What do you say now ?': 4}

Longest Common Substring#

Could we use difflib.SequenceMatcher.find_longest_match() on first and second half of doc, or various docs samples, to try to find common refrains?

Or chunk into paragraphs and compare every paragraph with every other paragraph?

Here’s how to call the SequenceMatcher().find_longest_match() function:

from difflib import SequenceMatcher

m = SequenceMatcher(None, txt_reps.split('\n')[1], txt_reps.split('\n')[2]).find_longest_match()
m, txt_reps.split('\n')[1][m.a: m.a + m.size]
(Match(a=9, b=33, size=33), ' There once was a thing that and ')

Doc2Vec Search Engine#

To explore: a simple Doc2Vec powered search engine based on https://www.kaggle.com/hgilles06/a-doc2vec-search-engine-cord19-new-version .