# Estimate the F, B, C and D polynomial coefficients of a model of orders
# (nf, nb, nc, nd) with input delay nk from the data in `iddata`, starting
# from the initial guess `X0`, and hand the result to `createModelOutput`.
#
# The iteration repeatedly evaluates the mean squared prediction error V, its
# gradient V_g and its approximate Hessian V_h with `calcDerivativesBJ` and
# moves theta along the direction V_h\V_g.
#
# Arguments
#   iddata          data object; its fields `y`, `u` and `Ts` are read
#   X0              initial parameters, laid out as [F ; B ; C ; D]; only the
#                   first nf+nb+nc+nd entries are used
#   nf,nb,nc,nd     number of coefficients of F, B, C and D
#   nk              input delay in samples
#   stabilityFix    if true, the D and F parts of theta are passed through
#                   `checkStability` after every update
#   stepSizeControl if true, a step that makes V worse (or NaN) is retried from
#                   the last accepted point with a ten times smaller step
#
# Returns whatever `createModelOutput` builds from the final theta, the summed
# squared error V*N and the number of evaluated samples N.
#
# Throws ArgumentError if X0 contains NaN in the used range or if the data
# record is shorter than the required history; raises an error if the cost
# becomes NaN while stepSizeControl is disabled.
function bj(iddata::iddataObject,X0::Array{Float64},nf::Int64,nb::Int64,nc::Int64,nd::Int64,nk::Int64;stabilityFix::Bool=false,stepSizeControl::Bool=true)

  maxIterations     = 30       # upper bound on accepted iterations
  maxStepReductions = 20       # upper bound on consecutive step shrinkings (mu down to 1e-20)
  exitThreshold     = 0.0001   # stop when V changes by less than this
  y                 = iddata.y
  u                 = iddata.u

  # index ranges of the four coefficient groups inside theta
  nTheta = nf + nb + nc + nd
  idxF   = 1:nf
  idxB   = nf+1:nf+nb
  idxC   = nf+nb+1:nf+nb+nc
  idxD   = nf+nb+nc+1:nTheta

  theta = X0[1:nTheta]
  if any(isnan, theta)
    throw(ArgumentError("bj: the initial parameter vector X0 must not contain NaN"))
  end

  # time horizon = largest lag used by calcDerivativesBJ, plus one
  timeHorizon = max(nb+nk+nd-1, nc+nf, nd+nf) + 1
  N           = length(y) - timeHorizon + 1
  if N < 1
    throw(ArgumentError("bj: the data record is too short for the requested model orders"))
  end

  V     = zeros(maxIterations)
  V_out = 0.0
  mu    = 1.0

  # Declared outside the loop so they survive from one iteration to the next.
  V_g           = zeros(nTheta)
  V_h           = zeros(nTheta,nTheta)
  thetaAccepted = copy(theta)     # last point whose cost was accepted
  direction     = zeros(nTheta)   # step direction V_h\V_g evaluated at thetaAccepted
  reductions    = 0               # consecutive step-size reductions so far

  result  = false
  counter = 1
  while !result

    # Calculate cost and first and second order derivatives at the current theta
    if any(isnan, theta)
      V[counter] = NaN
    else
      V_g,V_h,V[counter] = calcDerivativesBJ(y,u,theta[idxF],theta[idxB],theta[idxC],theta[idxD],timeHorizon,nf,nb,nc,nd,nk)
    end

    if counter == 1
      # nothing to compare against yet: always take the first step
      thetaAccepted = copy(theta)
      direction     = V_h\V_g
      theta         = thetaAccepted - mu*direction
    elseif stepSizeControl && ( isnan(V[counter]) || V[counter]>V[counter-1] )
      # the last step made the cost worse
      if reductions >= maxStepReductions
        # Shrinking the step further cannot help (this also covers a NaN
        # direction or NaN data, where every retry is NaN again): give up and
        # keep the last accepted estimate instead of looping forever.
        theta  = copy(thetaAccepted)
        V_out  = V[counter-1]*N
        result = true
      else
        # retry from the last accepted point with a smaller step; the counter
        # is stepped back so this attempt overwrites the rejected cost
        reductions += 1
        counter    -= 1
        mu         *= 0.1
        theta       = thetaAccepted - mu*direction
      end
    else
      if isnan(V[counter])
        error("bj: the cost function became NaN and stepSizeControl is disabled, cannot continue")
      end

      # the step was accepted: go back to full steps
      mu         = 1.0
      reductions = 0

      # exit criterion after a successful step
      if abs(V[counter]-V[counter-1]) < exitThreshold || counter == maxIterations
        result = true
        V_out  = V[counter]*N
      else
        thetaAccepted = copy(theta)
        direction     = V_h\V_g
        theta         = thetaAccepted - mu*direction
      end
    end

    # check for stability and invert roots if necessary; the stabilised theta
    # is the one that gets evaluated, accepted and used as restart point
    if stabilityFix
      theta[idxD] = checkStability(theta[idxD])
      theta[idxF] = checkStability(theta[idxF])
    end

    counter += 1
  end

  # build idModel as output
  Model = createModelOutput(theta,"bj",nf,nb,nc,nd,nk,iddata.Ts,V_out,N)

  return Model
end


# Evaluate the one-step-ahead prediction error criterion of the model with
# monic polynomials F, C, D (coefficients given without the leading 1) and
# input polynomial B delayed by nk samples, together with its derivatives.
#
# With CF = [1;C]*[1;F], DF = [1;D]*[1;F] and DB = [1;D]*B (polynomial
# products), the prediction for sample i is
#   yhat[i] = DB applied to u[i-nk], u[i-nk-1], ...
#           + CF applied to y[i], y[i-1], ...
#           - DF applied to y[i], y[i-1], ...
#           - CF[2:end] applied to yhat[i-1], yhat[i-2], ...
# Samples before `timeHorizon` are not predicted (yhat and psi stay zero there).
#
# Arguments
#   y, u         output and input samples (indexed linearly)
#   F, B, C, D   coefficient vectors of length nf, nb, nc, nd
#   timeHorizon  first sample to evaluate; must be larger than every lag used,
#                i.e. at least max(nb+nk+nd-1, nc+nf, nd+nf) + 1
#   nk           input delay
#
# Returns (V_g, V_h, VV): the averaged gradient -psi*err, the averaged
# outer-product Hessian approximation psi*psi', and the mean squared
# prediction error over samples timeHorizon:length(y).
function calcDerivativesBJ(y::Array{Float64},u::Array{Float64},F::Array{Float64},B::Array{Float64},C::Array{Float64},D::Array{Float64},
                              timeHorizon::Int64,nf::Int64,nb::Int64,nc::Int64,nd::Int64,nk::Int64)

  nSamples     = length(y)
  nTheta       = nf+nb+nc+nd
  y_prediction = zeros(nSamples)
  psi          = zeros(nSamples,nTheta)   # psi[i,k] = d yhat[i] / d theta[k]
  V_g          = zeros(nTheta)
  V_h          = zeros(nTheta,nTheta)
  e            = 0.0

  # monic versions of the polynomials, built once instead of in every iteration
  C1 = [1.0;C]
  D1 = [1.0;D]
  F1 = [1.0;F]

  # Help polynomials. A plain convolution keeps every coefficient, including
  # zeros, so the lengths are always nc+nf+1, nd+nf+1 and nd+nb.
  CF     = _polyProductBJ(C1,F1)
  DF     = _polyProductBJ(D1,F1)
  DB     = _polyProductBJ(D1,B)
  CFtail = CF[2:end]

  for i = timeHorizon:nSamples

    # 1. calculate the one step ahead prediction
    # DB starts at the delayed input u[i-nk]; its nd+nb coefficients reach
    # back to u[i-(nd+nk+nb-1)], the same oldest input sample as used below.
    yhat = _laggedSumBJ(DB,u,i-nk) + _laggedSumBJ(CF,y,i) - _laggedSumBJ(DF,y,i) - _laggedSumBJ(CFtail,y_prediction,i-1)
    y_prediction[i] = yhat

    # 2. calculate the gradient of the one step ahead prediction
    for j = 1:nf
      psi[i,j]          = _laggedSumBJ(C1,y,i-j) - _laggedSumBJ(C1,y_prediction,i-j) - _laggedSumBJ(D1,y,i-j) - _laggedSumBJ(CFtail,psi,i-1,j)
    end
    for j = 1:nb
      psi[i,nf+j]       = _laggedSumBJ(D1,u,i-j-nk+1) - _laggedSumBJ(CFtail,psi,i-1,nf+j)
    end
    for j = 1:nc
      psi[i,nf+nb+j]    = _laggedSumBJ(F1,y,i-j) - _laggedSumBJ(F1,y_prediction,i-j) - _laggedSumBJ(CFtail,psi,i-1,nf+nb+j)
    end
    for j = 1:nd
      psi[i,nf+nb+nc+j] = -_laggedSumBJ(F1,y,i-j) + _laggedSumBJ(B,u,i-j-nk) - _laggedSumBJ(CFtail,psi,i-1,nf+nb+nc+j)
    end

    # 3. accumulate gradient and Hessian approximation of the quadratic criterion
    err = y[i] - yhat
    for c = 1:nTheta
      V_g[c] -= psi[i,c]*err
      for r = 1:nTheta
        V_h[r,c] += psi[i,r]*psi[i,c]
      end
    end

    # summed squared error
    e += err^2
  end

  # normalize by the number of evaluated samples
  nUsed = nSamples - timeHorizon + 1
  V_g   = V_g/nUsed
  V_h   = V_h/nUsed
  VV    = e/nUsed

  return V_g,V_h,VV

end

# Coefficients of the product of two polynomials given as coefficient vectors
# (discrete convolution). No coefficient is ever dropped.
function _polyProductBJ(a,b)
  if isempty(a) || isempty(b)
    return zeros(0)
  end
  c = zeros(length(a)+length(b)-1)
  for p = 1:length(a)
    for q = 1:length(b)
      c[p+q-1] += a[p]*b[q]
    end
  end
  return c
end

# coeffs[1]*x[newest] + coeffs[2]*x[newest-1] + ... using linear indexing into x.
function _laggedSumBJ(coeffs,x,newest)
  acc = 0.0
  for k = 1:length(coeffs)
    acc += coeffs[k]*x[newest-k+1]
  end
  return acc
end

# Same as above, but running down column `col` of the matrix M.
function _laggedSumBJ(coeffs,M,newest,col)
  acc = 0.0
  for k = 1:length(coeffs)
    acc += coeffs[k]*M[newest-k+1,col]
  end
  return acc
end

