import pandas as pd
import matplotlib.pyplot as plt
import click

# col_name = 'tx_amount'
def get_avg(df):
    """Return the arithmetic mean of the first column of ``df``.

    Only the column at position 0 is looked at; any further columns are
    ignored.
    """
    first_column = df[df.columns[0]]
    return first_column.mean()

def get_median(df):
    """Return the median of the first column of ``df``.

    Only the column at position 0 is looked at; any further columns are
    ignored.
    """
    first_name = df.columns[0]
    values = df[first_name]
    return values.median()

def make_ccdf(df):
    """Plot N * CCDF of the first column of ``df`` on log-log axes.

    The values of the first column are sorted in descending order and the
    k-th value in that order is paired with rank k (1-based), so the y-axis
    counts how many samples are at least as large as the value on the
    x-axis.  The figure is displayed with ``plt.show()``; nothing is
    returned.

    Args:
        df: DataFrame whose first column holds the amounts to plot.  Any
            other columns are ignored, consistent with ``get_avg`` and
            ``get_median``.
    """
    # Use the first column only, as a 1-D array.  Converting the whole frame
    # would yield a 2-D array, and matplotlib would then draw one curve per
    # column even though only the first column is sorted.
    values = df.iloc[:, 0].sort_values(ascending=False).to_numpy()
    rank = range(1, len(values) + 1)

    plt.loglog(values, rank)
    plt.ylabel('N * CCDF')
    plt.xlabel('amount')
    plt.title('Answer to HW4 Q2(b)')
    plt.grid()
    plt.show()


# fname="amounts.csv.gz"

@click.command()
@click.argument('fname', type=click.Path(exists=True, readable=True), nargs=1)
def main(fname):
    """Plot mean/median versus sample size, then the CCDF, for FNAME.

    FNAME is a CSV file that pandas can read; the statistics are computed
    on its first column.
    """
    amounts = pd.read_csv(fname)
    total_rows = len(amounts)

    x_list = []
    avg_list = []
    median_list = []
    for exponent in range(5, 20):
        sample = amounts.head(2 ** exponent)
        # head() returns fewer rows than requested when the file is shorter,
        # so record the real sample size rather than the requested one.
        x_list.append(len(sample))
        avg_list.append(get_avg(sample))
        median_list.append(get_median(sample))
        if len(sample) >= total_rows:
            # Every larger prefix would just be the whole dataset again.
            break

    print("HW4 Q2a: plotting mean and median, vs sample-size")
    plt.loglog(x_list, avg_list,  'b-', x_list, median_list, 'g.')
    plt.xlabel('# samples')
    plt.ylabel('avg/median')
    plt.legend(['avg', 'median'])
    plt.title('Answer to HW4 Q2(a)')
    plt.grid()
    plt.show()

    print(f'     (not required) avg = {round(get_avg(amounts), 2)}')
    print(f'     (not required) median = {round(get_median(amounts), 2)}')

    print("HW4 Q2b - plotting the CCDF")
    make_ccdf(amounts)




# Script entry point: invoke the click command only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
