% Q-learning training loop (MATLAB-style pseudocode).
% For each simulated case, walk through the T time periods, choose a drug
% ε-greedily, draw the next health state at random, and apply the one-step
% Q-learning update to the period-specific Q-table Q{t}.
% Relies on names defined outside this listing: N, T, SS, DR, Q, SqDiz,
% function_IPR, discrepancy and mdiscrepancy (the last two must be
% initialised to [] before the loop).
FOR n = 1:N % Observe N cases.
    % Reset the per-case memory once, before stepping through the time
    % periods, so that the state carried over at the end of each period is
    % actually used by the next one.
    cState = 1; % Current health state.
    cStateIdx = 1; % Location (row) of the current state in the Q-table.
    fdHist = 1; % Memory variable to save the disease history.
    tHist = []; % Memory variable to save the treatment history.
    cProb = 1; % The probability of the current state.
    FOR t = 1:T % For each time period,
        % ε-greedy action choice: exploit with probability 1-(1/log(n+2)).
        pn = rand(1);
        IF pn < (1-(1/log(n+2))),
            % Greedy action: read the same Q-table row that is updated below.
            [~,drug] = max(Q{t}(cStateIdx,:));
        ELSE
            % Explore. Assumes SS is the number of available drugs, i.e. the
            % number of columns of Q{t} -- TODO confirm.
            drug = randi([1,SS]);
        END
        tHist = [tHist,drug]; % Record the chosen treatment.
        % Randomly generate a new event and update the disease history.
        nState = randi([1,3]); fdHist = [fdHist,nState];
        % Find the location in the Q-table of the next period: the row of
        % SqDiz{t+1} that equals the disease history observed so far.
        [~,nStateIdx] = ismember(fdHist,SqDiz{t+1},'rows');
        % Calculate the immediate reward associated to <s,a,s'>.
        % NOTE(review): the first output is presumably the vector of
        % next-state probabilities -- verify against function_IPR.
        [nextprob,IR] = function_IPR(t,cState,drug,cProb);
        % Update the value of Q. Step-size parameter alpha=(1/sqrt(n+2)).
        % DR weights the best next-period value (presumably the discount
        % rate). Q{T+1} must exist for the last period -- TODO confirm.
        delta = IR+DR*max(Q{t+1}(nStateIdx,:))-Q{t}(cStateIdx,drug);
        dQ = (1/sqrt(n+2))*delta;
        Q{t}(cStateIdx,drug) = Q{t}(cStateIdx,drug)+dQ;
        % Save the Q-variation and compute means over 100 Q-variation values.
        % Appending (instead of indexing by n) keeps every update and avoids
        % zero-padded buffers after a reset.
        discrepancy = [discrepancy,abs(dQ)];
        IF (length(discrepancy) == 100)
            mdiscrepancy = [mdiscrepancy,mean(discrepancy)];
            discrepancy = [];
        END;
        % Update the current state and the probability with the next state.
        cState = nState;
        cStateIdx = nStateIdx;
        % NOTE(review): assumes nextprob is indexed by the next health state.
        cProb = nextprob(1,nState);
    END
END
END
% Decide the optimal solution based on the values in the Q-tables and the
% feasibility test: for every period t and every history row h, scan the
% actions from the highest to the lowest Q-value and keep the first one that
% passes the feasibility test.
% Results are stored per period, OptV{t}(h) and OptSol{t}(h), because the
% number of rows of Q{t} may differ between periods.
% NOTE(review): if no action is feasible for a row, that row is left unset.
FOR t = 1:T
    FOR h = 1:size(Q{t},1)
        % v holds the Q-values of row h in descending order; idx holds the
        % corresponding action (column) numbers.
        [v,idx] = sort(Q{t}(h,:),'descend');
        FOR a = 1:size(v,2)
            IF the feasibility assumptions are satisfied by action idx(a),
                OptV{t}(h,1) = v(a); % Value of the best feasible action.
                OptSol{t}(h,1) = idx(a); % The best feasible action itself.
                BREAK
            END
        END
    END
END
|