#!/usr/bin/python3
"""Generate a few strings with unbalanced distributions to test the
regeneration of the Huffman tree when it gets too deep.

USAGE: make-test-vectors DIR

This will fill up DIR with test files.
"""
import sys
import random
from collections import defaultdict


# Show the usage text and stop when help is requested or when the number
# of arguments is wrong.  sys.exit() is used rather than the exit() helper
# because the latter is installed by the site module for interactive use
# and is not available when Python runs with -S.
if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2:
    print(__doc__)
    # Status 0 for a well-formed help request, 1 for a usage error.
    sys.exit(0 if len(sys.argv) == 2 else 1)


# Output directory: the single command line argument (the check above
# has already exited unless exactly one argument was given).
DIR = sys.argv[1]

# Number of bytes requested from every generator.
SIZE = (1 << 17) + (23)  # two and a bit blocks (presumably 64 KiB each)
# Label appended to each output file name to describe SIZE.
SIZE_NAME = "128k+"
# Alternative, smaller configuration (disabled):
# SIZE = (1 << 16)
# SIZE_NAME = "64"


# Fixed seed, so every run draws the same sequence from the random module.
random.seed(1)


def squares(n):
    """Return n bytes, each the scaled product of two uniform samples.

    Every byte is int(a * b * 256), where a and b are successive
    random.random() draws; the product skews the result towards low values.
    """
    return bytes(int(random.random() * random.random() * 256)
                 for _ in range(n))


def skewed_choices(n):
    """Return n bytes in which value v is drawn with weight v.

    Larger byte values are proportionally more likely; 0 has weight zero
    and is therefore never produced.
    """
    values = list(range(256))
    return bytes(random.choices(values, weights=values, k=n))


def fib_shuffle(n):
    """Return up to n bytes whose value frequencies follow Fibonacci numbers.

    A pool is built in which value 0 appears once, 1 twice, 2 three times,
    3 five times and so on (1, 2, 3, 5, 8, ...).  Building stops as soon as
    the pool holds more than 1,000,000 entries, or after 100 values.  The
    pool is shuffled and its first n entries are returned, so the result is
    shorter than n if n exceeds the pool size.
    """
    pool = []
    run_length, previous = 1, 1
    symbol = 0
    while symbol < 100 and len(pool) <= 1000000:
        pool += [symbol] * run_length
        # Advance the Fibonacci pair: the next run is the sum of the last two.
        run_length, previous = run_length + previous, run_length
        symbol += 1
    random.shuffle(pool)
    return bytes(pool[:n])


def exp_shuffle(n):
    """Return up to n bytes whose value frequencies grow exponentially.

    Value i is placed int(1.04 ** i) times into a pool, for i from 0 to
    255, stopping early if the pool ever exceeds 1,000,000 entries.  The
    pool is shuffled and its first n entries are returned, so the result is
    shorter than n if n exceeds the pool size.
    """
    pool = []
    symbol = 0
    while symbol < 256 and len(pool) <= 1000000:
        copies = int(1.04 ** symbol)
        pool += [symbol] * copies
        symbol += 1
    random.shuffle(pool)
    return bytes(pool[:n])


def and_rand(n):
    """Return n bytes, each the bitwise AND of two uniform random bytes.

    ANDing can only clear bits, so values with few set bits dominate.
    """
    return bytes(random.randrange(256) & random.randrange(256)
                 for _ in range(n))


def betavar(n, a, b):
    """Return n bytes sampled from a beta distribution.

    n: number of bytes to produce.
    a, b: the alpha and beta parameters passed to random.betavariate().

    Each sample is multiplied by a factor just below 256 before truncation,
    so even a sample of exactly 1.0 maps to 255 and still fits in a byte.
    """
    scale = 255.999999999999
    return bytes(int(random.betavariate(a, b) * scale) for _ in range(n))


def repeated_alphabet(n):
    """Return the first n bytes of 'abc...z' repeated end to end."""
    alphabet = b'abcdefghijklmnopqrstuvwxyz'
    # One repetition more than strictly needed, then trim to length.
    repeats = n // len(alphabet) + 1
    return (alphabet * repeats)[:n]


def decayed_alphabet(n):
    """Return n bytes of repeated alphabet with random single-byte overwrites.

    Starts from repeated_alphabet(n) and, for each byte value 0..255 in
    turn, stores that value at a uniformly random position.  Positions can
    repeat, so a later value may overwrite an earlier one.

    n: length of the result.  Zero (or negative) n yields b'' instead of
    raising, matching what the other generators return for n == 0.
    """
    if n <= 0:
        # There is no position to overwrite, and random.randrange(n) would
        # raise ValueError for an empty range.
        return b''

    s = list(repeated_alphabet(n))
    for value in range(256):
        s[random.randrange(n)] = value

    return bytes(s)


def trigram_model(n):
    """Return n bytes generated by a trigram model of this script's source.

    The script reads its own file and records, for every pair of adjacent
    bytes, each byte that follows that pair.  Starting from a random pair,
    it repeatedly picks a random recorded follower and slides the two-byte
    context along.  n + 10 bytes are generated and the first 10 discarded
    (presumably as warm-up).

    Because the model is built from this file's bytes, any edit to the
    script changes the generated data.

    A pair that occurs only at the very end of the file has no recorded
    follower; when the walk reaches one it restarts from a random known
    pair instead of failing in random.choice().
    """
    with open(__file__, 'rb') as f:
        data = f.read()

    # Two-byte context -> list of following bytes.  Duplicates are kept so
    # that frequent followers are proportionally more likely to be chosen.
    lut = defaultdict(list)
    for a, b, c in zip(data, data[1:], data[2:]):
        lut[bytes([a, b])].append(c)

    contexts = list(lut.keys())
    k = random.choice(contexts)
    s = []
    for _ in range(n + 10):
        # .get() avoids inserting an empty entry for an unknown context.
        followers = lut.get(k)
        if not followers:
            # Dead end: restart from a context that has followers.
            k = random.choice(contexts)
            followers = lut[k]
        c = random.choice(followers)
        s.append(c)
        # New context: second byte of the old one plus the byte just chosen.
        k = bytes([k[1], c])

    return bytes(s[10:])


def trigram_sum_model(n):
    """Return n bytes from a model keyed on the sum of the last two bytes.

    The script reads its own file and files every byte under the sum of
    the two bytes preceding it (512 buckets).  Each bucket is first seeded
    with one random byte, so no bucket is ever empty.  Generation starts
    from a random adjacent pair in the file, produces n + 10 bytes and
    discards the first 10.

    Because the model is built from this file's bytes, any edit to the
    script changes the generated data.
    """
    with open(__file__, 'rb') as src:
        corpus = src.read()

    # Seed the buckets before filling them; this order fixes how the
    # random stream is consumed.
    buckets = []
    for _ in range(512):
        buckets.append([random.randrange(256)])
    for first, second, follower in zip(corpus, corpus[1:], corpus[2:]):
        buckets[first + second].append(follower)

    start = random.randrange(len(corpus) - 1)
    older, newer = corpus[start], corpus[start + 1]

    generated = []
    for _ in range(n + 10):
        chosen = random.choice(buckets[older + newer])
        generated.append(chosen)
        older, newer = newer, chosen

    return bytes(generated[10:])


def the_classics():
    """Generate the eight original distributions, print a histogram of
    them, and write each one to DIR.

    For every byte value 0..255 one row is printed holding the number of
    occurrences in each series.  All eight columns are labelled in the
    header and separated by spaces, so adjacent counts cannot run together.

    Each series is written to "<DIR>/<name>-<SIZE_NAME>".
    """
    # this used to be main()
    # The tuple is evaluated top to bottom, which fixes the order in which
    # the generators consume the shared random stream; reordering the
    # entries would change the contents of every file.
    table = (
        ("sq", squares(SIZE), "square_series"),
        ("ch", skewed_choices(SIZE), "skewed_choices"),
        ("fs", fib_shuffle(SIZE), "fib_shuffle"),
        ("es", exp_shuffle(SIZE), "exp_shuffle"),
        ("ar", and_rand(SIZE), "and_rand"),
        ("bv1", betavar(SIZE, 0.1, 1.5), "beta-variate1"),
        ("bv2", betavar(SIZE, 0.5, 2.0), "beta-variate2"),
        ("bv3", betavar(SIZE, 0.05, 0.05), "beta-variate3"),
    )

    # Header and rows share the same column widths and separator.
    sep = "   "
    print(f"{'n':>3} " + sep.join(f"{label:>5}" for label, _, _ in table))
    for i in range(256):
        counts = sep.join(f"{series.count(i):5}" for _, series, _ in table)
        print(f"{i:3} {counts}")

    for _, series, fn in table:
        with open(f"{DIR}/{fn}-{SIZE_NAME}", "wb") as f:
            f.write(series)


def main():
    """Write every test vector into DIR.

    Runs the_classics() first, then generates the decayed-alphabet and the
    two trigram series and writes each to "<DIR>/<name>_<SIZE_NAME>".
    """
    the_classics()

    # Generated in this order after the_classics(), so the shared random
    # stream is consumed in a fixed sequence.
    extras = (
        (decayed_alphabet(SIZE), "decayed_alphabet"),
        (trigram_model(SIZE), "trigram"),
        (trigram_sum_model(SIZE), "trigram_sum"),
    )
    for series, fn in extras:
        with open(f"{DIR}/{fn}_{SIZE_NAME}", "wb") as f:
            f.write(series)


# Only generate the files when run as a script, not when imported.
if __name__ == "__main__":
    main()
