Search Engines

Errata & Other Content not on Other Pages

yash101

Published 2/9/2025

Updated 2/9/2025

Jump to Page 📚

Demonstration of Zipfs Law in Aesops Fables

.aesop-fables-runner.js

function normalize(string) {
  return string
    .normalize('NFKC')                       // Normalization form, compatibility decomposition followed by canonical composition
    .normalize('NFD')                        // Normalization form, canonical decomposition
    .replace(/(\p{M}|\p{Emoji_Modifier}|\p{P}\p{Sc}\p{Join_Control})/gu, '')  // Remove: marks / diacritics, emoji modifiers, punctuation
    .replace(/(\p{Emoji_Presentation}|\p{Extended_Pictographic})/gu, ' $1 ')  // put spaces around emojis so we treat them as words
    .replace(/\p{White_Space}/gu, ' ')       // transform whitespace to spaces
    .replace(/(\p{Ll})(\p{Lu})/gu, '$1 $2')  // split camelCase
    .replace(/(\p{N})(\p{L})/gu, '$1 $2')    // split number followed by word without space
    .replace(/(\p{L})(\p{N})/gu, '$1 $2')    // split word followed by number without space
    // Replace special characters with spaces
    .replace(/(\.|\|\+|\*|\?|\[|\]|\^|\$|\(|\)|\{|\}|\=|\!|\<|\>|\||\:|\-|\/|,|_|#|`|'|"|~|@|%|&)/g, ' ')
    .replace(/\s+/gu, ' ')                   // remove extra whitespace between
    .toLowerCase();                          // make all text lowercase
}

import fs from 'fs/promises';

(async () => {
    const file = await fs.readFile('../../../public/objects/aesop-fables.json', 'utf8').then(JSON.parse);
    const nf = file.map(story => ({
        title: story.title,
        text: normalize(story.text),
    }));

    await fs.writeFile('.aesop-fables-analysis.json', JSON.stringify(nf, null, 2));
})();

!! node .aesop-fables-runner.js

import matplotlib.pyplot as plt
import json
import random
import numpy as np
from scipy.stats import linregress

wordBank = {}

with open('.aesop-fables-analysis.json') as f:
    fables = json.load(f)
    for fable in fables:
        for term in fable['text'].split(' '):
            term = term.strip()
            if not term:
                continue
            wordBank[term] = wordBank.get(term, 0) + 1

n, k = 5, 500

sorted_terms = sorted([(term, count) for term, count in wordBank.items() if count >= n], key=lambda x: x[1], reverse=True)

sampled_terms = random.sample(sorted_terms, min(k, len(sorted_terms)))
sampled_terms.sort(key=lambda x: x[1], reverse=True)

_, frequencies = zip(*sampled_terms)
frequencies = np.array(frequencies)
highFrequencies = frequencies[:len(frequencies) // 2]

# Ranks starting at 1 for proper log
ranks = np.arange(1, len(frequencies) + 1)

# -- Perform Zipfian regression using scipy --
log_ranks = np.log(ranks)
log_freqs = np.log(frequencies)
slope, intercept, r_value, p_value, std_err = linregress(log_ranks, log_freqs)

# We define s = -slope to match the usual Zipf form (f = C * r^-s)
s = -slope
C = np.exp(intercept)

# Compute the fitted Zipf curve
zipf_fit = C / (ranks ** s)

# Create figure with two subplots
fig, axs = plt.subplots(2, 2, figsize=(12, 12))

# Linear scale plot
axs[0][0].plot(ranks, frequencies, marker='o', linestyle='-', label="Term frequency")
axs[0][0].plot(ranks, zipf_fit, '--', label=f"Zipf fit: s={s:.2f}, C={C:.2e}")
axs[0][0].set_xlabel('Rank (r)')
axs[0][0].set_ylabel('Frequency')
axs[0][0].set_title("Term frequency of Aesop Fables (Linear Scale)")
axs[0][0].grid(True)
axs[0][0].legend()

# Log-log scale plot
axs[0][1].plot(ranks, frequencies, marker='o', linestyle='-', label="Term frequency")
axs[0][1].plot(ranks, zipf_fit, '--', label=f"Zipf fit: s={s:.2f}, C={C:.2e}")
axs[0][1].set_xlabel('Rank (r)')
axs[0][1].set_ylabel('Frequency')
axs[0][1].set_title("Term frequency of Aesop Fables (Log-Log Scale)")
axs[0][1].set_xscale('log')
axs[0][1].set_yscale('log')
axs[0][1].grid(True, which="both", linestyle="--", linewidth=0.5)
axs[0][1].legend()

# Linear scale plot
axs[1][0].plot(ranks, frequencies, marker='o', linestyle='-', label="Term frequency")
axs[1][0].plot(ranks, zipf_fit, '--', label=f"Zipf fit: s={s:.2f}, C={C:.2e}")
axs[1][0].set_xlabel('Rank (r)')
axs[1][0].set_ylabel('Frequency')
axs[1][0].set_title("Term frequency of Aesop Fables (Linear Scale, zoomed in)")
axs[1][0].grid(True)
axs[1][0].legend()
axs[1][0].set_xlim([-5, len(ranks) // 5]) 

# Log-log scale plot
axs[1][1].plot(ranks, frequencies, marker='o', linestyle='-', label="Term frequency")
axs[1][1].plot(ranks, zipf_fit, '--', label=f"Zipf fit: s={s:.2f}, C={C:.2e}")
axs[1][1].set_xlabel('Rank (r)')
axs[1][1].set_ylabel('Frequency')
axs[1][1].set_title("Term frequency of Aesop Fables (Log-Log Scale, zoomed in)")
axs[1][1].set_xscale('log')
axs[1][1].set_yscale('log')
axs[1][1].grid(True, which="both", linestyle="--", linewidth=0.5)
axs[1][1].legend()
axs[1][1].set_xlim([1, len(ranks) // 5]) 

plt.tight_layout()
plt.show()

print(f"Fitted Zipf parameters: s = {s:.3f}, C = {C:.3e}")
print(f"R-squared: {r_value**2:.4f}, p-value: {p_value:.4g}, std-err: {std_err:.4g}")

Console Logs

Fitted Zipf parameters: s = 1.026, C = 2.576e+03

R-squared: 0.9952, p-value: 0, std-err: 0.003192

Display Output

Sources

TF-IDF