import polars as pl
import polars.selectors as cs
from cmdstanpy import CmdStanModel
import numpy as np
from scipy.special import logit
from patsy import dmatrices, dmatrix, build_design_matrices, standardize
import matplotlib.pyplot as plt
import seaborn as sns
import arviz as az

!cat model.stan

data {
    int W;
    int Y;
    int F;
    int G;
    matrix[F,4] X;
    vector[F] y;
    row_vector[4] pred_X;
    matrix[51,G] dem_logits;
    vector[F+1] dem_inc;
    matrix[51,51] S;
    int N;
    array[N] int obs_n;
    array[N] int obs_year;
    array[N] int obs_week;
    array[N] int obs_state;
    array[N] int obs_dems;
}
transformed data {
  matrix[51,G] deltas = dem_logits -
    rep_matrix(((2 * dem_inc[F-G+1:F] - 1) .* logit(y[F-G+1:]))', 51);
}
parameters {
    vector[4] beta;
    real<lower=0, upper=1> pred_y;
    array[Y,W] vector[51] steps;
    real<lower=0> l;
    real<lower=0> sigma;
    real<lower=0> delta_sigma;
    vector[51] pred_delta;
}

transformed parameters {
    array[Y, W] vector[51] m;
    m[Y,W] = (2 * dem_inc[F+1] - 1) * pred_y + pred_delta;
    for (p in 1:Y-1) m[Y-p,W] = dem_logits[,G-p+1];
    for (i in 1:(W - 1)) for (p in 1:Y) {
      m[p, W - i] = m[p, W - i + 1] + l * S * steps[p,W - i];
    }
}

model {
    sigma ~ exponential(1 / sd(y));
    l ~ exponential(1);
    delta_sigma ~ exponential(1);
    beta ~ normal(0, sd(y) * 2.5);
    y ~ normal(X * beta, sigma);
    pred_y ~ normal(pred_X * beta, sigma);
    for (p in 1:Y) for (w in 1:W) {
      steps[p,w] ~ std_normal();
    }
    for (i in 1:N) {
      obs_dems[i] ~ binomial_logit(obs_n[i], m[obs_year[i], obs_week[i]][obs_state[i]]);
    }
    for (i in 1:51) {
      deltas[i,2:] ~ normal(deltas[i,:G-1], delta_sigma);
    }
    pred_delta ~ normal(deltas[,G], delta_sigma);
}

abramowitz = pl.read_csv("../data/abramowitz_data.csv").sort(
    pl.col("year")).with_columns(incvote = pl.col("incvote") / 100)

sns.relplot(abramowitz.to_pandas(), x="juneapp", y="q2gdp", hue="incvote");

fundamentals_training = abramowitz.filter(pl.col("year") < 2016)

y, X = dmatrices("incvote ~ standardize(juneapp) + standardize(q2gdp) + term2", fundamentals_training.to_pandas())

pred_X = build_design_matrices([X.design_info], abramowitz.filter(pl.col("year") >= 2016).to_pandas())[0][0]

dem_inc = abramowitz.select(pl.col("deminc")).to_numpy()[:, 0]

state_vote_data = pl.read_csv("../data/potus_results_76_16.csv").filter(pl.col("year") < 2016).select(
    pl.col(["year", "state"]),
    dem_frac= pl.col("dem") / (pl.col("dem") + pl.col("rep"))
).sort(pl.col(["state", "year"]))

dem_logits = np.reshape(logit(state_vote_data.select("dem_frac")), (51, -1))

state_vals = state_vote_data.pivot(
    on="state", index="year", values="dem_frac"
).select(pl.col(pl.Float64))

S = np.linalg.cholesky(np.cov(state_vals, rowvar=False) + 1e-4 * np.eye(51))

states = pl.Enum(state_vote_data.get_column("state").unique().sort())

def prepare_poll_data(data, dem, rep, fmt, year):
    return data.filter(pl.col("state") != "--").with_columns(
        pl.col("^.*date$").str.to_datetime(fmt),
        total_votes = (pl.col("number.of.observations") * ((pl.col(dem) + pl.col(rep)) / 100)).round().cast(pl.Int32),
    ).select(
        pl.col(["total_votes", "start.date", "end.date"]),
        week=(pl.col("start.date") + ((pl.col("end.date") - pl.col("start.date")) / 2)).dt.week(),
        state=pl.col("state").cast(states),
        election_year=year,
        dem_votes= (pl.col("total_votes") * (pl.col(dem) / (pl.col(dem) + pl.col(rep)))).round().cast(pl.Int32)  
    )

poll_data = pl.concat([
    prepare_poll_data(pl.read_csv('../data/all_polls_2008.csv', infer_schema_length=10000), "obama", "mccain", "%m/%d/%y", 1),
    prepare_poll_data(pl.read_csv('../data/all_polls_2012.csv', infer_schema_length=10000), "obama", "romney", "%m/%d/%y", 2),
    prepare_poll_data(pl.read_csv('../data/all_polls.csv').with_columns(
        pl.col("number.of.observations").str.to_integer(strict=False).drop_nans()), "clinton", "trump", "%Y-%m-%d", 3)
]).drop_nulls()

poll_data.group_by(pl.col("election_year")).agg(min=pl.col("start.date").min(), max=pl.col("start.date").max())

poll_data = poll_data.filter(pl.col("week") > 40)

poll_data = poll_data.with_columns(week=pl.col("week") - pl.col("week").min() + 1)

data = {
    "obs_week": poll_data.get_column("week").to_numpy(),
    "obs_year": poll_data.get_column("election_year").to_numpy(),
    "obs_dems": poll_data.get_column("dem_votes").to_numpy(),
    "obs_state": poll_data.get_column("state").to_physical().to_numpy() + 1,
    "obs_n": poll_data.get_column("total_votes").to_numpy(),
    "N": poll_data.shape[0],
    "S": S,
    "dem_inc": dem_inc,
    "dem_logits": dem_logits,
    "X": np.asarray(X),
    "pred_X": np.asarray(pred_X),
    "y": np.asarray(y)[:,0],
    "W": poll_data.get_column("week").max(),
    "Y": 3,
    "F": X.shape[0],
    "G": dem_logits.shape[1]
}

model = CmdStanModel(stan_file="model.stan", cpp_options={'STAN_THREADS':'true'})

22:00:41 - cmdstanpy - INFO - compiling stan file /Users/sam/Desktop/us-potus-model/simplified/model.stan to exe file /Users/sam/Desktop/us-potus-model/simplified/model
22:00:47 - cmdstanpy - INFO - compiled model executable: /Users/sam/Desktop/us-potus-model/simplified/model

fit = model.sample(data=data)

22:00:55 - cmdstanpy - INFO - CmdStan start processing

chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

22:04:58 - cmdstanpy - INFO - CmdStan done processing.
22:04:58 - cmdstanpy - WARNING - Non-fatal error during sampling:
Exception: normal_lpdf: Scale parameter is 0, but must be positive! (in 'model.stan', line 47, column 4 to column 32)
	Exception: normal_lpdf: Scale parameter is 0, but must be positive! (in 'model.stan', line 47, column 4 to column 32)
	Exception: normal_lpdf: Scale parameter is 0, but must be positive! (in 'model.stan', line 47, column 4 to column 32)
	Exception: normal_lpdf: Scale parameter is 0, but must be positive! (in 'model.stan', line 56, column 6 to column 57)
	Exception: normal_lpdf: Scale parameter is 0, but must be positive! (in 'model.stan', line 47, column 4 to column 32)
	Exception: normal_lpdf: Scale parameter is 0, but must be positive! (in 'model.stan', line 56, column 6 to column 57)
Consider re-running with show_console=True if the above output is unclear!

poll_data.group_by("state").agg(count=pl.len()).sort("count", descending=True).head()

m = fit.stan_variable("m")

az.plot_hdi(1 + np.arange(10), m[None,:, 1, :, list(states.categories).index('FL')], smooth=False)
for (w,l) in poll_data.filter(pl.col("state") == "FL").group_by("week").agg(pl.len()).iter_rows():
    plt.axvline(x=w, c='grey', linewidth=l, alpha=0.2)

az.plot_hdi(1 + np.arange(10), m[None,:, 2, :, list(states.categories).index('FL')], smooth=False)
for (w,l) in poll_data.filter(pl.col("state") == "FL").group_by("week").agg(pl.len()).iter_rows():
    plt.axvline(x=w, c='grey', linewidth=l, alpha=0.2)

inf1 = az.from_cmdstanpy(posterior=fit, observed_data=data,  posterior_predictive="obs_hat")

az.plot_ppc(inf1, data_pairs = {'obs_dems' : 'obs_hat'}, var_names='obs_dems');

az.plot_loo_pit(inf1, y="obs_dems", y_hat="obs_hat")

<Axes: xlabel='obs_dems'>

election_year	min	max
i32	datetime[μs]	datetime[μs]
1	2008-01-04 00:00:00	2008-10-31 00:00:00
2	2012-02-27 00:00:00	2012-11-05 00:00:00
3	2015-06-18 00:00:00	2016-11-06 00:00:00

state	count
enum	u32
"FL"	109
"OH"	103
"NC"	83
"VA"	81
"PA"	73

Election Forecasting with a Random Walk through the Polls¶