% Select the key parameters for reinforcement learning.
T = 4; % The number of drug-switching periods.
DR = 0.8; % Discount rate.
N = 1000000; % The number of cases (1000, 10000 and 100000 were also tested).
FR = 1; % The number of future-reward steps considered (one to three steps were tested).
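% Note (an illustration only, not necessarily the exact update rule used
% in this script): in standard tabular Q-learning the discount rate DR
% weights the value of future states,
%   Q(s,a) <- Q(s,a) + alpha*( r + DR*max_a' Q(s',a') - Q(s,a) ),
% where alpha denotes a learning rate (symbol assumed here).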
% Define the health state space, decomposed by time period. Each health state encodes the full preceding disease history (state 1 or 2 at each period).
HS1 = 1;
HS2 = [1,1;1,2];
HS3 = [1,1,1;1,1,2;1,2,1;1,2,2];
HS4 = [1,1,1,1;1,1,1,2;1,1,2,1;1,1,2,2;...
1,2,1,1;1,2,1,2;1,2,2,1;1,2,2,2];
HS5 = [1,1,1,1,1;1,1,1,1,2;1,1,1,2,1;1,1,1,2,2;...
1,1,2,1,1;1,1,2,1,2;1,1,2,2,1;1,1,2,2,2;...
1,2,1,1,1;1,2,1,1,2;1,2,1,2,1;1,2,1,2,2;...
1,2,2,1,1;1,2,2,1,2;1,2,2,2,1;1,2,2,2,2];
HS = {HS1,HS2,HS3,HS4,HS5};
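% Sanity check (a sketch): each HS{t} enumerates, in the same row order
% as the hard-coded tables above, all 2^(t-1) disease histories of
% length t that start in state 1, with every later entry equal to 1 or 2.
for t = 2:5
    assert(isequal(HS{t}, [ones(2^(t-1),1), dec2bin(0:2^(t-1)-1, t-1) - '0' + 1]));
end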
% Define the search (action) space, decomposed by time period.
SS1 = (1:1:4); % Possible treatment options 1-4 at t1.
SS2 = (1:1:10); % Possible treatment options 1-10 at t2.
SS3 = (1:1:14); % Possible treatment options 1-14 at t3.
SS4 = (1:1:14); % Possible treatment options 1-14 at t4.
SS5 = (1:1:14); % Possible treatment options 1-14 at t5.
SS = {SS1,SS2,SS3,SS4,SS5};
% Initialise the Q-tables for each time period to 0.
Q1 = zeros(size(HS1,1),size(SS1,2)); % 1x4 matrix at t1.
Q2 = zeros(size(HS2,1),size(SS2,2)); % 2x10 matrix at t2.
Q3 = zeros(size(HS3,1),size(SS3,2)); % 4x14 matrix at t3.
Q4 = zeros(size(HS4,1),size(SS4,2)); % 8x14 matrix at t4.
Q5 = zeros(size(HS5,1),size(SS5,2)); % 16x14 matrix at t5.
Q = {Q1,Q2,Q3,Q4,Q5};
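% Sanity check (a sketch): each Q-table should have one row per health
% state and one column per treatment option at its time period.
for t = 1:numel(Q)
    assert(isequal(size(Q{t}), [size(HS{t},1), numel(SS{t})]));
end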
% Initialise the parameters to check convergence.
Discrepancy = []; mDiscrepancy = [];
% Initialise the solution tables: OptSeq stores the optimal treatment sequences and MaxV the maximum reward obtained when those sequences are used.
OptSeq = zeros(2^(T-1),T); MaxV = zeros(2^(T-1),T);
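% The 2^(T-1) rows are assumed to correspond one-to-one with the disease
% histories enumerated in HS{T}, and the T columns to the decision taken
% at each drug-switching period.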
% Repeat the Q-value calculation for N simulated cases.
for n = 1:N
cState = 1; % Initial state.
cStateIdx = 1; % Location of the current state in the Q-table.
dHist = 1; % Memory variable to save the disease history.
tHist = []; % Memory variable to save the treatment history.
cMT = 0; % Maintenance therapy status (initialised to 0).
cProb = 1; % The probability of the initial state.
cSBP = 173.5; cSBPSD = 21.1; % Initial systolic blood pressure (SBP) mean and standard deviation.
% For each time period, generate the subsequent states from cState through to a terminal state.