#=   ARX-Model - recursive implementation
For the ARX model we assume a structure like
y(n)+a_1*y(n-1)+...+a_na*y(n-na) = b_1*u(n-nk)+...+b_nb*u(n-nk-nb+1)+e(n),
which results in
theta = [ a_1      a_2     ...  a_na      b_1      ...  b_nb         ]'
phi   = [-y(n-1)  -y(n-2)  ... -y(n-na)   u(n-nk)  ...  u(n-nk-nb+1) ]'

Our best prediction is then:
y(n) = theta^T*phi
                                                                    =#
function arx(iddata::iddataObject, na::Int64, nb::Int64, nk::Int64)
  y      =   iddata.y;
  u      =   iddata.u;
  N      =   length(y);

  # find time horizone (largest value going back in history)
  n1 = nk+nb-1;
  n2 = na;
  if n1>=n2
    timeHorizone = n1;
  else
    timeHorizone = n2;
  end

  fn      = zeros(na+nb);
  Rn      = zeros(na+nb, na+nb)
  for n=1:N
    # discard the first 'timeHorizone' samples, start from then on
    if n>timeHorizone
      phi   =   [flipdim(-iddata.y[n-na:n-1],1); flipdim(iddata.u[n-nk-nb+1:n-nk],1)]

      fn    =   fn + phi*y[n];
      Rn    =   Rn + phi*transpose(phi);
    end
  end

  # average these time summed values
  fn   = fn/(N-timeHorizone);
  Rn   = Rn/(N-timeHorizone);

  # calculate final values
  theta = inv(Rn)*fn;

  # create Model Output
  Model = createModelOutput(theta,"arx",na,nb,0,na,nk,iddata.Ts)

  return Model;
end

#=   arx
This arx method doesn't use the recursive implementation (unlike the previous function) - it uses the Gauß-Newton approach.

Author : Lars Lindemann @2015
                                                                    =#

function arx(iddata::iddataObject,X0::Array{Float64},nf::Int64,nb::Int64,nk::Int64,stabilityFix::Bool=false,stepSizeControl::Bool=true)

  maxIterations= 30;
  exitTreshold = 0.0001;
  y            = iddata.y;
  u            = iddata.u;

  A            = X0[1:nf];
  B            = X0[nf+1:nf+nb];
  theta        = [A ; B];
  T            = theta;
  V            = zeros(maxIterations);
  mu           = 1;
  V_out        = 0;

  # find time horizon (largest value going back in history)
  n1             = nb+nk-1;
  n              = findmax([n1; nf]);
  timeHorizon    = n[1] + 1;
  N              = length(y)-timeHorizon+1;

  result = false;
  counter = 1;

  while(!result)

    # Calculate first and second order derivatives
    V_g,V_h,V[counter] = calcDerivativesARX(y,u,A,B,timeHorizon,nf,nb,nk);

    # if MSE got worse in the last step, control the step size
    if counter == 1
      theta = theta - mu*(V_h\V_g);
      T     = [T theta];
    elseif ( V[counter]>V[counter-1] || isnan(V[counter]) ) && stepSizeControl
      # reset counter and lower step size
      counter    -= 1;
      mu         *= 0.1;
      # calculate new theta
      theta                    = T[:,size(T,2)-1];
      V_g,V_h,V[counter+1]     = calcDerivativesARX(y,u,theta[1:nf],theta[nf+1:nf+nb],timeHorizon,nf,nb,nk);
      theta                    = theta - mu*(V_h\V_g);
      T[:,size(T,2)]           = theta;
    # if MSE got better in the last step
    else
      # reset mu to 1
      mu = 1;

      # exit criterion after a successfull step
      if abs(V[counter]-V[counter-1])<exitTreshold
        result = true;
        V_out  = V[counter]*N;
      elseif counter == maxIterations
        result = true
        V_out  = V[counter]*N;
      else
        # use steepest descent
        theta = theta - mu*(V_h\V_g);
        T     = [T theta];
      end
    end

    # check for stability and inverse roots if necessary
    if stabilityFix
      theta[1:nf]  = checkStability(theta[1:nf])
    end

    # set new thetas for the next step
    A     = theta[1:nf];
    B     = theta[nf+1:nf+nb];

    counter +=1;
  end

  # build idModel as output
  Model = createModelOutput(theta,"arx",nf,nb,0,nf,nk,iddata.Ts,V_out,N)

  return Model;
end

#=   arx_func
Calculates the loss function for PEM2 with ForwardDiff, where x is the
theta parameter vector

Author : Lars Lindemann @2015
                                                                    =#

function arx_func(x)
  global VV = 0;
  V = 0;

  for i = timeHorizon:length(y)
    # 1. calculate the one step ahead predictions
    y_prediction = 0;

    for ia = 1:nf
      y_prediction   += -x[ia]*y[i-ia];
    end
    for ib = 1:nb
      y_prediction   += x[nf+ib]*u[i-ib-nk+1];
    end

    # 2. calculate actual V
    V        += (y[i]-y_prediction)^2

  end

  VV = V/(length(y)-timeHorizon+1);

  return VV;
end

#=   calcDerivativesARX
Calculates gradient and Hessian of the ARX loss function for PEM

Author : Lars Lindemann @2015
                                                                    =#

function calcDerivativesARX(y::Array{Float64},u::Array{Float64},A::Array{Float64},B::Array{Float64},
                              timeHorizon::Int64,nf::Int64,nb::Int64,nk::Int64)

  y_prediction = zeros(length(y));
  psi          = zeros(length(y),nf+nb);
  V_g          = zeros(nf+nb);
  V_h          = zeros(nf+nb,nf+nb);
  e            = 0;

  for i = timeHorizon:length(y)
    # 1. calculate the one step ahead predictions
    y_prediction[i]    += (-transpose(A)*y[i-1:-1:i-nf])[1];
    y_prediction[i]    += (transpose(B)*u[i-nk:-1:i-nb-nk+1])[1];

    # 2. calculate grammian of the one step ahead predictions
    psi[i,1:nf]             = collect(-y[i-1:-1:i-nf].');
    psi[i,nf+1:nf+nb]       = collect(u[i-nk:-1:i-nk-nb+1].');

    # 3. calculate grammian and hessian of the quadratic criterion
    V_g   += -psi[i,:].'*(y[i]-y_prediction[i]);
    V_h   += psi[i,:].'*psi[i,:];

    # calculate summed error
    e     += (y[i]-y_prediction[i])^2;
  end

  # normalize
  V_g = V_g/(length(y)-timeHorizon+1);
  V_h = V_h/(length(y)-timeHorizon+1);
  VV  = e/(length(y)-timeHorizon+1);

  return V_g,V_h,VV;
end
