
	***********************
	*****EPHOR PROGRAM***** 
	***********************


	***FACTORS TO BE MANIPULATED

	**MAIN SILICA EFFECT
	**SECONDARY DIESEL EFFECT
	**SECONDARY SMOKING EFFECT
	**CORRELATION MATRIX BETWEEN EXPOSURES
	**INTERVETION EFFECT (TYPE AND SIZE)
	**AGE OF RETIREMENT
	**CALENDAR YEAR OF EFFECT
	**SAMPLE SIZE, DEFAULT 5000
	**REPETIONS, DEFAULT 1000
	**STORAGE LOCATION

	**Possbible extras
	**decay diesel & smoking
	**latency average, min, max, silica/diesel,smoking
	**number of interventions

quietly program define ephorhia
			version 18.0
			syntax [ , sileff(real 1.05) diseff(real 1.0005) smkeff(real 1.42) ///
			interact(real 1) silinteff(integer 100) disinteff(integer 100) decay(integer 0) retage(integer 65) ///
			year(integer 2010) n(integer 1000) reps(integer 1000)  ///
			silintn(string) disintn(string) corrmat(name) save(string) seed(integer 3002) ///
			]

	di "Check user written commands gentrun and gtools are installed, intall via SSC if required"
	which gentrun
	which gtools
	*gtools, upgrade
			
	///replace string intervention with none if missing		
			if "`silintn'" == "" local silintn "none"		
			if "`disintn'" == "" local disintn "none"		
			
		

	// Check if Silica effect n is provided and positive
		if `sileff' < 1 {
			display as error "Option silica effect must be a positive value greater than 1."
			exit 198
		}

	// Check if diesel fumes effect n is provided and positive
		if `diseff' < 1 {
			display as error "Option diseff() must be a positive value greater than 1."
			exit 198
		}
	// Check if smoking effect n is provided and positive
		if `smkeff' < 1 {
			display as error "Option n() must be a positive value greater than 1."
			exit 198
		}

	// Validate the silinteff() option between 1 and 100
		if `silinteff' < 1 | `silinteff' > 100 {
			display as error "Option silinteff() must be an integer between 1 and 100."
			exit 198
		}

	// Validate the disinteff() option between 1 and 100
		if `disinteff' < 1 | `disinteff' > 100 {
			display as error "Option disinteff() must be an integer between 1 and 100."
			exit 198
		}
		
	// Validate the decay() option between 0 and 30
		if `decay' < 0 | `decay' > 30 {
			display as error "Option decay() must be an integer between 0 and 30."
			exit 198
		}

	// Validate the retage() option between 50 and 80
		if `retage' < 50 | `retage' > 80 {
			display as error "Option retage() must be an integer between 50 and 80."
			exit 198
		}

	// Validate the year() option between 1990 and 2020
		if `year' < 1980 | `year' > 2020 {
			display as error "Option year() must be an integer between 1980 and 2020."
			exit 198
		}

	// Validate the interact() option between 0 and 10
		if `interact' < 0 | `interact' > 10 {
			display as error "Option interact() must be an integer between 0 and 10."
			exit 198
		}
		
	// Validate the string intervention against a list of acceptable options

	if inlist("`silintn'", "ave", "max", "none") == 0 {
			display as error "Invalid value for exposure intervention. Acceptable values are: ave, max, none."
			exit 198
		}

	if inlist("`disintn'", "ave", "max", "none") == 0 {
			display as error "Invalid value for exposure intervention. Acceptable values are: ave, max, none."
			exit 198
		}
		
		
	// Set a default correlation matrix if none is provided
		if "`corrmat'" == "" {
			matrix default_corr = (1, 0, 0 \ 0, 1, 0 \ 0, 0, 1)
			matrix C = default_corr
			di "Using default correlation matrix:"
		} 
		
		else {
		
			// Assign the provided matrix
			matrix C = `corrmat'
			// Optional: Validate that it is square and symmetric
			local rows = rowsof(C)
			local cols = colsof(C)
			if `rows' != `cols' {
				display as error "The provided matrix is not square. A correlation matrix must be square."
				exit 198
			}
			
			if issymmetric(C) == 0 {
			display as error "The provided matrix is not symmetric. A correlation matrix must be symmetric."
			exit(198)
			} 
			
			else {
			di "Using the provided correlation matrix for silica, smoking, diesel:"
			matrix list C				
			}
			
			}	


		
	// Check if the user provided a save location
		if "`save'" == "" {
			display as error "You must provide a save location."
			exit 198
		}

	// Validate the save location (if additional checks are needed)
	// Here you could add logic to verify file extensions, permissions, etc.
	//check file exists
		if fileexists("`save'/LC_datafile") {
		display as error "The file `savelocation' already exists. This file & saved plots will be replaced."
		}
	//check valid path
		local dir = substr("`save'", 1, strrpos("`savelocation'", "/"))
		if "`dir'" != "" & !fileexists("`dir'") {
			display as error "The directory `dir' does not exist. Please provide a valid path."
			exit 198
		}
	//check naming rules, no special characters
		if regexm("`save'", "[\\*?\<>|]") {
			display as error "The file name contains invalid characters: \ / : * ? \ < > |"
			exit 198
		}
		
		
			di in green "USER (or Default) DEFINED PARAMETERS"
			di in green ""
			di in green "Relative Risk per Cumulative Increase in Silica = `sileff'"	
			di in green "Relative Risk per Cumulative Increase in Diesel Fumes = `diseff'"	
			di in green "Relative Risk per Cumulative Increase in Smoking Pack Years = `smkeff'"	
			di in green "User Defined Intervention type for Silica = `silintn'"
			di in green "User Defined Intervention type for Diesel Fumes = `disintn'"
			di in green "User Defined Intervention effect for both Silica = `silinteff'"
			di in green "User Defined Intervention effect for both Diesel Fumes = `disinteff'"
			di in green "Years of half-life decay for silica = `decay'"
			di in green "Retirement Age = `retage'"			
			di in green "Intervention Year = `year'"			
			di in green "Sample size = `n'"			
			di in green "Number of = `reps'"	
			di in green "Correlation Matrix for the three exposures"
			matrix list C
			di in green "Interaction effect between Silica & Diesel (Note 1 = no interaction) = `interact'"
			di in green "Save Destination = `save'"
			di in green "Random Number Seed = `seed'"

		
		
		***INTEVENTION OCCURS AT 2010 - technically the main analysis

	clear
	quietly save "`save'/LC_datafile", emptyok replace

	**Simulations start date/time
	noisily di "Start Date/Time = $S_DATE $S_TIME"
	***Set random seed and begin loop

	set seed `seed'
	noisily display in yellow "Number of reps completed" 
	timer on 1
	quietly forvalues r = 1(1)`reps' {
	clear
	**set size of simulated sample
	set obs `n'
	gen id=_n

	**BUILD BASE CHARACTERISTICS OF CONSTRUCTION WORKERS
	**i.e. define any characteristic that is fixed at the indiviual level
	****************************************************

	**FIXED CONSTRUCTION WORK CHARACTERISTICS

	**gen age and year at entry to construction work**
	**gen length of time in construction before leaving/retirment**
	**age at entry-->using mata command RTNORM: random truncated normal distribution, min 16 yrs, max 30 yrs, mean 20 years, sd 3**
	**RTNORM causes issues with the loop, so switched to gentrun
	**gent run simulates under standard normal distribution, convert to mean relevant age and sd 
	gentrun double age_at_entry, left(-1.3333) right(3.3333)
	replace age_at_entry=(age_at_entry*3)+20
	label var age_at_entry "Age of entry to construction ind"

	**year of entry: uniform min 1970 max 2020,3000 new workers per decade for a total 15000 workers** 
	gen year_of_entry=floor(runiform(1940, 2060))
	label var year_of_entry "Year of entry to construction ind"

	**gen variable of max year leaving construction/retirement i.e. assumes everyone would work to retirement 65 if perfectly health i.e. max exposure period
	gen year_at_maxexit=year_of_entry+(`retage'-age_at_entry)
	label var year_at_maxexit "Year of expected retirement without ill health"

	**gen max work duration i.e. time between year_at_start
	gen max_wk_duration=year_at_maxexit-year_of_entry
	label var max_wk_duration "Max time in work (assume in wrk till retirment age 65)"

	**gen age leaving construction/retirment
	gen age_at_maxexit=age_at_entry+max_wk_duration
	label var age_at_maxexit "age leaving construction"

	**PARTICIPANT CHARACTERISTICS
	**generate birth year
	gen year_of_birth=round(year_of_entry-age_at_entry)
	label var year_of_birth "Year of birth"
		
	*Generate gender. random uniform number.
	**Should we reflect % of women working in construction(~11%, 10% engineering, 1% on-site)???**
	**Will need to change gender % in future**
	gen sex=(runiform()<0.11)
	label define sex 0 "Male" 1 "Female", replace
	label values sex sex
	tab sex

	**INDIVIDUALS LATENCY CHARACTERISTIC - SMK/SIL

	**generate a randomly assigned latency period for each individual (Assumes smoking and silica have own indep latency periods)
	**note diesel defined as 10 years fixed

	**Smoking lognormal distribution for latency, i.e. from 10 to 40 years, mean=20 years, sd=(40-10)/6**
	gen loglatency_smk = exp(ln(10) + ((ln(40)-ln(10))/6) * invnorm(uniform()))

	**Silica lognormal distribution for latency, i.e. from 10 to 40 years, mean=20 years, sd=(40-10)/6**
	gen loglatency_sil = exp(ln(10) + ((ln(50)-ln(10))/6) * invnorm(uniform()))


	***FIXED SMOKING CHARACTERISTICS

	***We build the simulation for lung cancer starting with smoking***
	*********************************************************************
	***NO CONSTANCES yet***


	**Initally assume 60% of our population aged between 16-30 begin smoking***
	***Starting in 1970s this is assumed to drop by 5% per decade until 2030***
	***but still higher to the general population***
	**I generate a 60% of smokers starting in the 70s, 55% in the 80s....**
	gen smoking_YN=(runiform()<=0.6) if year_of_entry<1980
	label var smoking_YN "Ever Smoker Yes/No" 

	*gen decade of entry
	gen decade_of_entry=floor((year_of_entry-1970)/10)
	tab decade_of_entry
	sum decade_of_entry
	forvalues i = 1(1)`r(max)' {
		replace smoking_YN=(runiform()<=(0.6*(1-(0.05*`i')))) if decade_of_entry==`i' & year_of_entry<2030

	}
	replace smoking_YN=(runiform()<=(0.6*(1-(0.05*(floor(2030-1970)/10))))) if year_of_entry>=2030

	tab year_of_entry smoking_YN, row

	***We randomly assign a smoking initiation year, and duration using
	***truncated normal distribution with a mean 20 years (s.d 5 years), min 14, max 30***
	***We randomly assign a duration 30 years (s.d. 10 years), min 1, max 50 yrs***
	***I use a mata command (rtnorm) to draw a truncated normal distribution for***


	***Age at smk initiation
	gentrun double age_at_start, left(-1.2) right(2)
	replace age_at_start=(age_at_start*5)+20
	label var age_at_start "Age when start smoking"  

	**duration of smoking in years
	gentrun double smoking_duration, left(-2.9) right(2)
	replace smoking_duration=(smoking_duration*10)+30
	label var smoking_duration "length of time smoking yrs"  

	***no need for smoking start age or duration for those never started smoking***
	foreach var in age_at_start smoking_duration {
		replace `var'=. if smoking_YN==0
	}
	 
	**gen year at the time started/stopped smoking based on age at starting smoking**
	gen year_at_start=round(year_of_entry-age_at_entry+age_at_start)
	label var year_at_start "Year started smoking"
	gen year_at_stop=round(year_at_start+smoking_duration)
	label var year_at_stop "Year stopped smoking"


	***START TO BUILD EXPOSURE HISTORIES 
	**steps
	**1. simulated between person exposure variation 'between_person' smk/silica/diesel using drawnorm code.
	**2. expand data for each persons total follow up i.e. 100 years of follow up
	**3. gen individuals, annual population average smk year_pop_ave based on decade of year i.e 18.5 if 1970s droping on cig per decade
	**4. gen within person variation in exposure over the course of exposure period
	**5. combined pop_ave + between_person + within_person to get annual individual exposure values
	**note, all of above needs to be in the log_normal scale



	****DEFINING THE FIXED BETWEEN PERSON EXPOSURE VARIATION 
	**between person variation in Smoking, Respirable Crystalline Silica (RCS) and diesel fumes exposures using drawnorm
	**Repeat three times for an indep corr structure (all 3 corr=0), max corr structure (all 3 corr = 0.5), and bespoke (0.1 with smk, 0.5 sil/dis)
	**Note, Bespoke corr 0.1, 0.1, 0.5****************************


	***between exposure corr indep
	**set matrix of correlation between exposures (note 0 = indpendent)
	*matrix C = (1, 0, 0 \ 0, 1, 0 \ 0, 0, 1)
	**set matrix vector of average exposure (ensure follows a normal arthimetic scale hence ln())
	matrix m=(0,0,0)
	**set matrix vector of sd of exposures (note set on the normal arthimetic scale hence conversions for sil/dis geometric mean/sd i.e. ln() and smk artmetic means/sd i.e. sqrt(ln(... (note here had to assume mean 18.5 through out despite it changing over time))
	**assume sd remains constant over time
	matrix sd=(ln(4.5), sqrt(ln(1+(5^2/18.5^2))), ln(2.8))
	drawnorm between_sili between_smki between_disi, means(m) sds(sd)  corr(C)

	label var between_sili "silica between person var normal scale indep" 
	label var between_smki "smk between person var normal scale indep"
	label var between_disi "diesel between person var normal scale indep"



	**expand data 100 years for each individual post 14 i.e. first year posible smoker 
	**t, by year at work**
	expand 100
	bys id: gen t=_n
	label var t "sequential year since age 14"
	gen age=14+(t-1)
	label var age "Age at year (min=14)"
	gen year=year_of_birth+14+(t-1)
	label var year "corresponding year since age 14"


	**Generate annual concentration of exposure to smk/diesel/silica in turn 

	*******SMOKING EXPOSURE HISTORIES*********
	 **NEED TO CALCULATE EXPOSURE IN PACK-YEARS BY DECADE**

	**We first simulate number of cigarettes per day**
	**starts with arthmetic mean 18.5 in 1970s cohort dropping by 1 cigarette per decade until its 14.5 in 2010s**
	**Then apply factor to reduce by 1 cig per decade, stop reducing at 2030**

	**gen population_ave cig_per_day smok
	gen pop_ave_smk=18.5 if smoking_YN==1 & year>=year_at_start & year<=year_at_stop
	replace pop_ave_smk=17.5 if year>=1980 & smoking_YN==1 & year>=year_at_start & year<=year_at_stop
	replace pop_ave_smk=16.5 if year>=1990 & smoking_YN==1 & year>=year_at_start & year<=year_at_stop
	replace pop_ave_smk=15.5 if year>=2000 & smoking_YN==1 & year>=year_at_start & year<=year_at_stop
	replace pop_ave_smk=14.5 if year>=2010 & smoking_YN==1 & year>=year_at_start & year<=year_at_stop
	replace pop_ave_smk=13.5 if year>=2020 & smoking_YN==1 & year>=year_at_start & year<=year_at_stop
	*replace pop_ave_smk=12.5 if year>=2030 & smoking_YN==1 & year>=year_at_start & year<=year_at_stop
	*replace pop_ave_smk=11.5 if year>=2040 & smoking_YN==1 & year>=year_at_start & year<=year_at_stop

	**modify for female smoking habits
	replace pop_ave_smk=pop_ave_smk*0.5 if sex==1

	*if year<2000 & smoking_YN==1 & year>=year_at_start & year<=year_at_stop & sex==1
	*replace pop_ave_smk=13.5 if year<2000 & smoking_YN==1 & year>=year_at_start & year<=year_at_stop & sex==1
	*replace pop_ave_smk=pop_ave_smk-1 if year>2000 & smoking_YN==1 & year>=year_at_start & year<=year_at_stop & sex==1


	**assumes arthmetic mean is from log normal distribution of smoking 
	**convert pop_ave to 'normal' dist values
	replace pop_ave_smk=ln(pop_ave_smk^2/sqrt(2^2+pop_ave_smk^2))


	*gen within person variation assuming log_normal dis of smoking - annual variation within person 
	gen within_smk=rnormal(0,ln(1.04))


	**gen combined final annual cigarattes per day for indep/max0.5/bespoke corr
	**We need to know the smoking duration at current time so we can calculate pack-years at that time**
	gen smoking_dur_at_time=(year-year_at_start)+1 if year>=year_at_start
	replace smoking_dur_at_time=smoking_dur_at_time[_n-1] if year>year_at_stop
	replace smoking_dur_at_time=0 if smoking_dur_at_time==.


	**combine three components & convert back to log_normal distribution
	gen smk_cpdi=exp(pop_ave_smk+between_smki+within_smk)

	**ensure non-smokers and the years prior to start, and post stopping smoking are set to 0
	replace smk_cpdi=0 if smoking_YN==0 | year<year_at_start | year>year_at_stop
	**ensure smokers are not smoking more than 100 annual cig per day
	local c = 1
	while `c' > 0 {
	bysort id (t): replace smk_cpdi=smk_cpdi[_n-1] if smk_cpdi>100 & smk_cpdi[_n-1]!=0
	bysort id (t): replace smk_cpdi=smk_cpdi[_n+1] if smk_cpdi>100 & smk_cpdi[_n+1]!=0
	bysort id (t): replace smk_cpdi=smk_cpdi[_n-1] if smk_cpdi==. & smk_cpdi[_n-1]!=.
	count if smk_cpdi>100
	local c = `r(N)'	
	replace smk_cpdi=99 if smk_cpdi==. & smoking_YN==1 & year>=year_at_start & year<=year_at_stop
	}


	**GEN cumulative cigarretes_per_day durng life**
	bysort id (t):gen smoke_cumulativei=sum(smk_cpdi)

	**Pack-years at current time using  cig per day at that time**
	**first calculate pack years annual**
	gen pack_years_annuali=smk_cpdi/20
	replace pack_years_annuali=0 if smoking_dur_at_time==.
	**Now generate pack years acumulated at time**
	gen pack_yearsi=((smoke_cumulativei/smoking_dur_at_time)/20)*smoking_dur_at_time
	

	label var smk_cpdi "Simulated annual ave Cig per day log-norm dist" 


	**start by creating a variable for 100 previous years leading up to current year of exposure
	forvalues lg = 0(1)100 {
	bysort id (t): gen smk_l0_d10_lagi`lg'=pack_years_annuali[_n-`lg']*2^(-`lg'/10)
	}

	**gen cumulative exposure for current year 
	egen pack_yearsi_l0_d10=rowtotal(smk_l0_d10_lagi*)
	drop smk_l0_d10_lag*

	**Latency log, decay 10**
	**use loglatency generated before expanded**
	bysort id (t): gen pack_yearsi_l10_d10=pack_yearsi_l0_d10[_n-loglatency_smk] 	
	replace pack_yearsi_l10_d10=0 if t<=loglatency_smk+1
	rename pack_yearsi_l10_d10 pack_yearsi_base



	*******DIESEL EXPOSURE HISTORIES********

	gen pop_ave_dis=.

	*replace smoking pop_ave per decade untile decade 7 (2040s)
	replace pop_ave_dis=23.267 if year<1975
	replace pop_ave_dis=20.000 if year>=1975 & year<1980
	replace pop_ave_dis=17.181 if year>=1980 & year<1985
	replace pop_ave_dis=14.759 if year>=1985 & year<1990
	replace pop_ave_dis=12.677 if year>=1990 & year<1995
	replace pop_ave_dis=10.891 if year>=1995 & year<2000
	replace pop_ave_dis=9.355 if year>=2000 & year<2005
	replace pop_ave_dis=8.036 if year>=2005 & year<2010
	replace pop_ave_dis=6.902 if year>=2010 & year<2015
	replace pop_ave_dis=5.929 if year>=2015 & year<2020
	replace pop_ave_dis=5.929 if year>=2020
	*replace pop_ave_dis=2.867 if year>=2025 & year<2030
	*replace pop_ave_dis=2.462 if year>=2030

	**modify for female exposure
	replace pop_ave_dis=pop_ave_dis*.10 if sex==1


	*gen within person variation
	gen within_dis=rnormal(0,ln(1.11))

	**convert to normal dist and combine as pop-ave is geometric means
	replace pop_ave_dis=ln(pop_ave_dis)

	******CREATE BASE EXPOSURE & INTERVENTIONS FOR INTERVENTION YEAR ONWARDS*****

	**gen final annual diesel exposure per day
	gen dis_expi=exp(pop_ave_dis+between_disi+within_dis)
	**ensure dis_exp is 0 on years not at work
	replace dis_expi=0 if year<year_of_entry | year>year_at_maxexit


	gen dis_expi_int=dis_expi
	**replace with aver reduction if intevention is ave reduction, max exposure level if intervention in max, or none if no intervention is required
	**average exposure decrease by user defined intervention effect for diesel

	if "`disintn'" == "ave" {
	replace dis_expi_int=dis_expi*(`disinteff'/100) if year>=`year' 
	}
	**max exposure limit by user defined intervention effect for diesel


	if "`disintn'" == "max" {
	_pctile dis_expi if year==`year', percentiles(`disinteff')
	replace dis_expi_int=`r(r1)' if sil_expi_int>`r(r1)' & year>=`year' 
	}

	**Cumulative exposures**
	bysort id (t): gen diesel_cumi=sum(dis_expi)
	**repeat for interventions 
	bysort id (t): gen diesel_cumi_int=sum(dis_expi_int)





	label var dis_expi "Simulated annual ave Diesel Exposure log-norm dist indep corr" 
	label var dis_expi_int "Diesel exposure under reduction intervention post 2010 indep corr"
	label var diesel_cumi_int "Cumulative Diesel exposure with ave reduction intevention 2010+ indep corr"




	**gen decaying risk = 0 i.e. no half-life decay
	forvalues lg = 0(1)100 {
	**gen decaying exposure for each year prior to year t back to t=0 
	bysort id (t): gen diesel_l0_d0_lagi`lg'=dis_expi[_n-`lg']
	bysort id (t): gen disint_l0_d0_lagi`lg'=dis_expi_int[_n-`lg']
	}

	egen dieselcumi_l0_d0=rowtotal(diesel_l0_d0_lagi*)
	egen disintcumi_l0_d0=rowtotal(disint_l0_d0_lagi*)

	drop diesel_l0_d0_lag* disint_l0_d0_lag*

	**formula for lagged exposure with half-life decay = lagged e(t) = e(0)*2^(-t/half-life deacy time)
	**gen latency = 10 for decay = 0
	foreach lat in 10 {
	bysort id (t): gen dieselcumi_base=dieselcumi_l0_d0[_n-`lat'] 	
	replace dieselcumi_base=0 if t<=`lat' 

	**repeat for intevention 
	bysort id (t): gen dieselcumi_int=disintcumi_l0_d0[_n-`lat'] 	
	replace dieselcumi_int=0 if t<=`lat' 
	}
		


	label var dieselcumi_base "Cumulative diesel exposure baseline exposure"
	label var dieselcumi_int "Cumulative diesel exposure intevention applied" 



	*******SILCA EXPOSURE HISTORIES********
	*assumes log-normal distibution with starting geometric average of 0.238mg/m3 and GSD 3.51 from 1960 to 1975**

	gen pop_ave_sil=.
	*replace smoking pop_ave per decade untile decade 7 (2040s)
	replace pop_ave_sil=0.404 if year<=1975
	replace pop_ave_sil=0.296 if year>=1975 & year<1980
	replace pop_ave_sil=0.218 if year>=1980 & year<1985
	replace pop_ave_sil=0.160 if year>=1985 & year<1990
	replace pop_ave_sil=0.117 if year>=1990 & year<1995
	replace pop_ave_sil=0.087 if year>=1995 & year<2000
	replace pop_ave_sil=0.063 if year>=2000 & year<2005
	replace pop_ave_sil=0.046 if year>=2005 & year<2010
	replace pop_ave_sil=0.034 if year>=2010 & year<2015
	replace pop_ave_sil=0.025 if year>=2015 & year<2020
	replace pop_ave_sil=0.025 if year>=2020
	*replace pop_ave_sil=0.008 if year>=2025 & year<2030
	*replace pop_ave_sil=0.007 if year>=2030

	**modify for female exposure
	replace pop_ave_sil=pop_ave_sil*.10 if sex==1

	*gen within person variation
	gen within_sil=rnormal(0,ln(1.19))

	**covert to normal dist and combine
	replace pop_ave_sil=ln(pop_ave_sil)


	gen sil_expi=exp(pop_ave_sil+between_sili+within_sil)
	replace sil_expi=0 if year<year_of_entry | year>year_at_maxexit

	**average exposure decrease ave by user defined intervention - 
	gen sil_expi_int=sil_expi


	if "`silintn'" == "ave" {
	replace sil_expi_int=sil_expi*(`silinteff'/100) if year>=`year'

	}


	**max exposure decrease to max by user defined intervention 
	if "`silintn'" == "max" {
	_pctile sil_expi if year==`year', percentiles(`silinteff')
	replace sil_expi_int=`r(r1)' if sil_expi_int>`r(r1)' & year>=`year' 

	}


	**Cumulative exposure**
	bysort id (t): gen silica_cumi=sum(sil_expi)
	**repeat for interventions 30% ave & 75percentile max
	bysort id (t): gen silica_cumiint=sum(sil_expi_int)


	label var sil_expi "Simulated annual ave silica exposure log-norm dist base" 
	label var sil_expi_int "silica exposure under intervention post 2010"

	label var silica_cumi "Cumulative silica exposure base"
	label var silica_cumiint "Cumulative silica exposure under reduction due to intervention 2010+"


	**Decaying risk and latency versions

	**gen decaying risk = 0 i.e. no half-life decay
	forvalues lg = 0(1)100 {

	**gen decaying exposure for each year prior to year t back to t=0
	**formula for lagged exposure with half-life decay = laggede(t) = e(0)*2^(-t/half-life deacy time)
	bysort id (t): gen silica_l0_d0_lagi`lg'=sil_expi[_n-`lg']
	bysort id (t): gen silint_l0_d0_lagi`lg'=sil_expi_int[_n-`lg']
	}

	egen silicacumi_l0_d0=rowtotal(silica_l0_d0_lagi*)
	egen silintcumi_l0_d0=rowtotal(silint_l0_d0_lagi*)

	drop silica_l0_d0_lag* silint_l0_d0_lag* 


	**gen latency=10log for decay = 0
	**use loglatency_sil generated before expanded**
	bysort id (t): gen silcumi_base=silicacumi_l0_d0[_n-loglatency_sil]	
	replace silcumi_base=0 if t<loglatency_sil+1 

	bysort id (t): gen silcumi_int=silintcumi_l0_d0[_n-loglatency_sil] 	
	replace silcumi_int=0 if t<loglatency_sil+1 




	**define decaying risk as  5 15  half-life
	foreach dec in `decay' {
	**gen lagged exposure per person per year from joining workforce in year t=0
	forvalues lg = 0(1)100 {
	**gen decaying exposure for each year prior to year t back to t=0
	**formula for lagged exposure with half-life decay = laggede(t) = e(0)*2^(-t/half-life deacy time) 
	bysort id (t): gen sili_l0_d`dec'_lag`lg'=sil_expi[_n-`lg']*2^(-`lg'/`dec')

	bysort id (t): gen siliint_l0_d`dec'_lag`lg'=sil_expi_int[_n-`lg']*2^(-`lg'/`dec')

	}

	egen silcumi_l0_d`dec'=rowtotal(sili_l0_d`dec'_lag*)
	egen silcuminti_l0_d`dec'=rowtotal(siliint_l0_d`dec'_lag*)

	drop sili_l0_d`dec'_lag* siliint_l0_d`dec'_lag* 

	**generate version with latency and decaying exposure - note lat=0 already created

	**gen latency=10log for user defined decay value
	bysort id (t): replace silcumi_base=silcumi_l0_d`dec'[_n-loglatency_sil]  if `decay'!=0	
	replace silcumi_base=0 if t<loglatency_sil & `decay'!=0

	bysort id (t): replace silcumi_int=silcuminti_l0_d`dec'[_n-loglatency_sil] if `decay'!=0	
	replace silcumi_int=0 if t<loglatency_sil & `decay'!=0


	}


	label var silcumi_base "Cumulative silica exposure (30% ave reduction 2010+) lat=0 decay=0 indep corr"
	label var silcumi_int "Cumulative silica exposure (30% ave reduction 2010+) lat=10 decay=0 indep corr"

	


	***************DEFINE BASELINE RISK AT EACH AGE**************
	**I.E. RISK OF LUNG CANCER IN MALE NON-SMOKERS
	**follows age group risk of lung cancer as reported by cancer reserach uk
	**adjusted such that 10% of risk for males and 15% for females representing
	**non_smokers (as indicated by UK biobank)

	gen base=. 
	label var base "Baseline risk in non-smokers by age and gender"
	**males
	replace base=0.000002 if age>=14 & age<=19 & sex==0
	replace base=0.000002 if age>=20 & age<=24 & sex==0
	replace base=0.000007 if age>=25 & age<=29 & sex==0
	replace base=0.00001 if age>=30 & age<=34 & sex==0
	replace base=0.000021 if age>=35 & age<=39 & sex==0
	replace base=0.0000053 if age>=40 & age<=44 & sex==0
	replace base=0.0000143 if age>=45 & age<=49 & sex==0
	replace base=0.0000335 if age>=50 & age<=54 & sex==0
	replace base=0.0000671 if age>=55 & age<=59 & sex==0
	replace base=0.000134 if age>=60 & age<=64 & sex==0
	replace base=0.0002276 if age>=65 & age<=69 & sex==0
	replace base=0.0003116 if age>=70 & age<=74 & sex==0
	replace base=0.0004489 if age>=75 & age<=79 & sex==0
	replace base=0.0004745 if age>=80 & age<=84 & sex==0
	replace base=0.0005139 if age>=85 & age<=89 & sex==0
	replace base=0.0004891 if age>=90 & age<. & sex==0

	**females
	replace base=0.000004 if age>=14 & age<=19 & sex==1
	replace base=0.000004 if age>=20 & age<=24 & sex==1
	replace base=0.000009 if age>=25 & age<=29 & sex==1
	replace base=0.000014 if age>=30 & age<=34 & sex==1
	replace base=0.000048 if age>=35 & age<=39 & sex==1
	replace base=0.000021 if age>=40 & age<=44 & sex==1
	replace base=0.0000456 if age>=45 & age<=49 & sex==1
	replace base=0.0000984 if age>=50 & age<=54 & sex==1
	replace base=0.0001815 if age>=55 & age<=59 & sex==1
	replace base=0.00028455 if age>=60 & age<=64 & sex==1
	replace base=0.00039405 if age>=65 & age<=69 & sex==1
	replace base=0.00048615 if age>=70 & age<=74 & sex==1
	replace base=0.00047205 if age>=75 & age<=79 & sex==1
	replace base=0.00048915 if age>=80 & age<=84 & sex==1
	replace base=0.0003825 if age>=85 & age<=89 & sex==1
	replace base=0.0003825 if age>=90 & age<. & sex==1


	********SIMULATE A YEAR OF DEATH USING RISK FROM LIFE TABLES***********
	***Using mid point 1990 life-table as impractical to use life tables from every year**

	gen death_YN=0

	**Simulate death for males
	replace death_YN=1 if age==14 & sex==0 & uniform()<=0.000297
	replace death_YN=1 if age==15 & sex==0 & uniform()<=0.000414
	replace death_YN=1 if age==16 & sex==0 & uniform()<=0.000537
	replace death_YN=1 if age==17 & sex==0 & uniform()<=0.000771
	replace death_YN=1 if age==18 & sex==0 & uniform()<=0.000899
	replace death_YN=1 if age==19 & sex==0 & uniform()<=0.000867
	replace death_YN=1 if age==20 & sex==0 & uniform()<=0.000894
	replace death_YN=1 if age==21 & sex==0 & uniform()<=0.000914
	replace death_YN=1 if age==22 & sex==0 & uniform()<=0.000933
	replace death_YN=1 if age==23 & sex==0 & uniform()<=0.000945
	replace death_YN=1 if age==24 & sex==0 & uniform()<=0.000912
	replace death_YN=1 if age==25 & sex==0 & uniform()<=0.000896
	replace death_YN=1 if age==26 & sex==0 & uniform()<=0.000948
	replace death_YN=1 if age==27 & sex==0 & uniform()<=0.000907
	replace death_YN=1 if age==28 & sex==0 & uniform()<=0.000924
	replace death_YN=1 if age==29 & sex==0 & uniform()<=0.000972
	replace death_YN=1 if age==30 & sex==0 & uniform()<=0.000955
	replace death_YN=1 if age==31 & sex==0 & uniform()<=0.001034
	replace death_YN=1 if age==32 & sex==0 & uniform()<=0.001042
	replace death_YN=1 if age==33 & sex==0 & uniform()<=0.001044
	replace death_YN=1 if age==34 & sex==0 & uniform()<=0.0011
	replace death_YN=1 if age==35 & sex==0 & uniform()<=0.001221
	replace death_YN=1 if age==36 & sex==0 & uniform()<=0.001313
	replace death_YN=1 if age==37 & sex==0 & uniform()<=0.001425
	replace death_YN=1 if age==38 & sex==0 & uniform()<=0.001565
	replace death_YN=1 if age==39 & sex==0 & uniform()<=0.001686
	replace death_YN=1 if age==40 & sex==0 & uniform()<=0.001719
	replace death_YN=1 if age==41 & sex==0 & uniform()<=0.001959
	replace death_YN=1 if age==42 & sex==0 & uniform()<=0.002096
	replace death_YN=1 if age==43 & sex==0 & uniform()<=0.002249
	replace death_YN=1 if age==44 & sex==0 & uniform()<=0.002405
	replace death_YN=1 if age==45 & sex==0 & uniform()<=0.002735
	replace death_YN=1 if age==46 & sex==0 & uniform()<=0.003202
	replace death_YN=1 if age==47 & sex==0 & uniform()<=0.003384
	replace death_YN=1 if age==48 & sex==0 & uniform()<=0.003796
	replace death_YN=1 if age==49 & sex==0 & uniform()<=0.004292
	replace death_YN=1 if age==50 & sex==0 & uniform()<=0.004857
	replace death_YN=1 if age==51 & sex==0 & uniform()<=0.005395
	replace death_YN=1 if age==52 & sex==0 & uniform()<=0.005896
	replace death_YN=1 if age==53 & sex==0 & uniform()<=0.006659
	replace death_YN=1 if age==54 & sex==0 & uniform()<=0.00723
	replace death_YN=1 if age==55 & sex==0 & uniform()<=0.008251
	replace death_YN=1 if age==56 & sex==0 & uniform()<=0.009097
	replace death_YN=1 if age==57 & sex==0 & uniform()<=0.010231
	replace death_YN=1 if age==58 & sex==0 & uniform()<=0.011466
	replace death_YN=1 if age==59 & sex==0 & uniform()<=0.01267
	replace death_YN=1 if age==60 & sex==0 & uniform()<=0.01425
	replace death_YN=1 if age==61 & sex==0 & uniform()<=0.016041
	replace death_YN=1 if age==62 & sex==0 & uniform()<=0.017832
	replace death_YN=1 if age==63 & sex==0 & uniform()<=0.020221
	replace death_YN=1 if age==64 & sex==0 & uniform()<=0.022462
	replace death_YN=1 if age==65 & sex==0 & uniform()<=0.025026
	replace death_YN=1 if age==66 & sex==0 & uniform()<=0.027466
	replace death_YN=1 if age==67 & sex==0 & uniform()<=0.030879
	replace death_YN=1 if age==68 & sex==0 & uniform()<=0.033414
	replace death_YN=1 if age==69 & sex==0 & uniform()<=0.036941
	replace death_YN=1 if age==70 & sex==0 & uniform()<=0.039599
	replace death_YN=1 if age==71 & sex==0 & uniform()<=0.043787
	replace death_YN=1 if age==72 & sex==0 & uniform()<=0.048007
	replace death_YN=1 if age==73 & sex==0 & uniform()<=0.05314
	replace death_YN=1 if age==74 & sex==0 & uniform()<=0.058194
	replace death_YN=1 if age==75 & sex==0 & uniform()<=0.062153
	replace death_YN=1 if age==76 & sex==0 & uniform()<=0.068562
	replace death_YN=1 if age==77 & sex==0 & uniform()<=0.074933
	replace death_YN=1 if age==78 & sex==0 & uniform()<=0.081748
	replace death_YN=1 if age==79 & sex==0 & uniform()<=0.08944
	replace death_YN=1 if age==80 & sex==0 & uniform()<=0.098259
	replace death_YN=1 if age==81 & sex==0 & uniform()<=0.105381
	replace death_YN=1 if age==82 & sex==0 & uniform()<=0.115263
	replace death_YN=1 if age==83 & sex==0 & uniform()<=0.125567
	replace death_YN=1 if age==84 & sex==0 & uniform()<=0.136785
	replace death_YN=1 if age==85 & sex==0 & uniform()<=0.146158
	replace death_YN=1 if age==86 & sex==0 & uniform()<=0.158659
	replace death_YN=1 if age==87 & sex==0 & uniform()<=0.171571
	replace death_YN=1 if age==88 & sex==0 & uniform()<=0.181549
	replace death_YN=1 if age==89 & sex==0 & uniform()<=0.196972
	replace death_YN=1 if age==90 & sex==0 & uniform()<=0.208406
	replace death_YN=1 if age==91 & sex==0 & uniform()<=0.22292
	replace death_YN=1 if age==92 & sex==0 & uniform()<=0.240461
	replace death_YN=1 if age==93 & sex==0 & uniform()<=0.259515
	replace death_YN=1 if age==94 & sex==0 & uniform()<=0.28188
	replace death_YN=1 if age==95 & sex==0 & uniform()<=0.298551
	replace death_YN=1 if age==96 & sex==0 & uniform()<=0.318925
	replace death_YN=1 if age==97 & sex==0 & uniform()<=0.330975
	replace death_YN=1 if age==98 & sex==0 & uniform()<=0.342498
	replace death_YN=1 if age==99 & sex==0 & uniform()<=0.381219
	replace death_YN=1 if age>=100 & sex==0 & uniform()<=0.396158

	**repeat for females
	replace death_YN=1 if age==14 & sex==1 & uniform()<=0.000209
	replace death_YN=1 if age==15 & sex==1 & uniform()<=0.000208
	replace death_YN=1 if age==16 & sex==1 & uniform()<=0.00026
	replace death_YN=1 if age==17 & sex==1 & uniform()<=0.000322
	replace death_YN=1 if age==18 & sex==1 & uniform()<=0.000301
	replace death_YN=1 if age==19 & sex==1 & uniform()<=0.000327
	replace death_YN=1 if age==20 & sex==1 & uniform()<=0.000307
	replace death_YN=1 if age==21 & sex==1 & uniform()<=0.000325
	replace death_YN=1 if age==22 & sex==1 & uniform()<=0.000325
	replace death_YN=1 if age==23 & sex==1 & uniform()<=0.000326
	replace death_YN=1 if age==24 & sex==1 & uniform()<=0.000337
	replace death_YN=1 if age==25 & sex==1 & uniform()<=0.000338
	replace death_YN=1 if age==26 & sex==1 & uniform()<=0.000356
	replace death_YN=1 if age==27 & sex==1 & uniform()<=0.000373
	replace death_YN=1 if age==28 & sex==1 & uniform()<=0.00041
	replace death_YN=1 if age==29 & sex==1 & uniform()<=0.000408
	replace death_YN=1 if age==30 & sex==1 & uniform()<=0.000402
	replace death_YN=1 if age==31 & sex==1 & uniform()<=0.000483
	replace death_YN=1 if age==32 & sex==1 & uniform()<=0.000545
	replace death_YN=1 if age==33 & sex==1 & uniform()<=0.000566
	replace death_YN=1 if age==34 & sex==1 & uniform()<=0.000638
	replace death_YN=1 if age==35 & sex==1 & uniform()<=0.000719
	replace death_YN=1 if age==36 & sex==1 & uniform()<=0.000763
	replace death_YN=1 if age==37 & sex==1 & uniform()<=0.00083
	replace death_YN=1 if age==38 & sex==1 & uniform()<=0.000918
	replace death_YN=1 if age==39 & sex==1 & uniform()<=0.001036
	replace death_YN=1 if age==40 & sex==1 & uniform()<=0.001067
	replace death_YN=1 if age==41 & sex==1 & uniform()<=0.001187
	replace death_YN=1 if age==42 & sex==1 & uniform()<=0.001372
	replace death_YN=1 if age==43 & sex==1 & uniform()<=0.00137
	replace death_YN=1 if age==44 & sex==1 & uniform()<=0.001635
	replace death_YN=1 if age==45 & sex==1 & uniform()<=0.001827
	replace death_YN=1 if age==46 & sex==1 & uniform()<=0.002023
	replace death_YN=1 if age==47 & sex==1 & uniform()<=0.002262
	replace death_YN=1 if age==48 & sex==1 & uniform()<=0.00249
	replace death_YN=1 if age==49 & sex==1 & uniform()<=0.002679
	replace death_YN=1 if age==50 & sex==1 & uniform()<=0.003045
	replace death_YN=1 if age==51 & sex==1 & uniform()<=0.00336
	replace death_YN=1 if age==52 & sex==1 & uniform()<=0.00373
	replace death_YN=1 if age==53 & sex==1 & uniform()<=0.003972
	replace death_YN=1 if age==54 & sex==1 & uniform()<=0.004347
	replace death_YN=1 if age==55 & sex==1 & uniform()<=0.004872
	replace death_YN=1 if age==56 & sex==1 & uniform()<=0.005458
	replace death_YN=1 if age==57 & sex==1 & uniform()<=0.006206
	replace death_YN=1 if age==58 & sex==1 & uniform()<=0.006741
	replace death_YN=1 if age==59 & sex==1 & uniform()<=0.007583
	replace death_YN=1 if age==60 & sex==1 & uniform()<=0.008577
	replace death_YN=1 if age==61 & sex==1 & uniform()<=0.009591
	replace death_YN=1 if age==62 & sex==1 & uniform()<=0.010372
	replace death_YN=1 if age==63 & sex==1 & uniform()<=0.01155
	replace death_YN=1 if age==64 & sex==1 & uniform()<=0.012993
	replace death_YN=1 if age==65 & sex==1 & uniform()<=0.014437
	replace death_YN=1 if age==66 & sex==1 & uniform()<=0.015411
	replace death_YN=1 if age==67 & sex==1 & uniform()<=0.017252
	replace death_YN=1 if age==68 & sex==1 & uniform()<=0.018839
	replace death_YN=1 if age==69 & sex==1 & uniform()<=0.020663
	replace death_YN=1 if age==70 & sex==1 & uniform()<=0.022368
	replace death_YN=1 if age==71 & sex==1 & uniform()<=0.024247
	replace death_YN=1 if age==72 & sex==1 & uniform()<=0.02753
	replace death_YN=1 if age==73 & sex==1 & uniform()<=0.031026
	replace death_YN=1 if age==74 & sex==1 & uniform()<=0.033343
	replace death_YN=1 if age==75 & sex==1 & uniform()<=0.036272
	replace death_YN=1 if age==76 & sex==1 & uniform()<=0.039841
	replace death_YN=1 if age==77 & sex==1 & uniform()<=0.044453
	replace death_YN=1 if age==78 & sex==1 & uniform()<=0.049033
	replace death_YN=1 if age==79 & sex==1 & uniform()<=0.054551
	replace death_YN=1 if age==80 & sex==1 & uniform()<=0.060603
	replace death_YN=1 if age==81 & sex==1 & uniform()<=0.067124
	replace death_YN=1 if age==82 & sex==1 & uniform()<=0.074292
	replace death_YN=1 if age==83 & sex==1 & uniform()<=0.082263
	replace death_YN=1 if age==84 & sex==1 & uniform()<=0.090685
	replace death_YN=1 if age==85 & sex==1 & uniform()<=0.100754
	replace death_YN=1 if age==86 & sex==1 & uniform()<=0.112211
	replace death_YN=1 if age==87 & sex==1 & uniform()<=0.122616
	replace death_YN=1 if age==88 & sex==1 & uniform()<=0.132504
	replace death_YN=1 if age==89 & sex==1 & uniform()<=0.147101
	replace death_YN=1 if age==90 & sex==1 & uniform()<=0.163633
	replace death_YN=1 if age==91 & sex==1 & uniform()<=0.178249
	replace death_YN=1 if age==92 & sex==1 & uniform()<=0.194666
	replace death_YN=1 if age==93 & sex==1 & uniform()<=0.211488
	replace death_YN=1 if age==94 & sex==1 & uniform()<=0.227664
	replace death_YN=1 if age==95 & sex==1 & uniform()<=0.251821
	replace death_YN=1 if age==96 & sex==1 & uniform()<=0.26858
	replace death_YN=1 if age==97 & sex==1 & uniform()<=0.284927
	replace death_YN=1 if age==98 & sex==1 & uniform()<=0.293256
	replace death_YN=1 if age==99 & sex==1 & uniform()<=0.324548
	replace death_YN=1 if age>=100 & sex==1 & uniform()<=0.337119



	**replace each year post first death as death
	bysort id (t): replace death_YN=1 if t>1 & death_YN[_n-1]==1
	label var death_YN "indicator of when if they have died"
	label define death_YN 0 "Alive" 1 "Death", replace 
	label values death_YN death_YN

	**gen year of death
	gen year_of_death=year if death_YN==1
	gsort id -death_YN t
	bysort id: replace year_of_death=year_of_death[_n-1] if _n>1 
	label var year_of_death "Year death occurred based in lifetables"

	**gen age of death
	gen age_of_death=age if death_YN==1
	gsort id -death_YN t
	bysort id: replace age_of_death=age_of_death[_n-1] if _n>1 
	label var age_of_death "Age death occurred based on lifetables"

	**drop years post death
	bysort id: drop if year>year_of_death

	***Count number alive at each year
	bysort year: gegen year_alive_n = count(death_YN)
	label var year_alive_n "Number still alive at start of year"

	***Count number deaths at each year
	bysort year: gegen year_deaths_n = count(death_YN) if death_YN==1
	gsort year -death_YN t
	by year: replace year_deaths_n=year_deaths_n[_n-1] if _n>1
	label var year_deaths_n "Number died during year"

	***Count number alive at each age grp
	***Count number alive at each year
	bysort age: gegen age_alive_n = count(death_YN)
	label var age_alive_n "Number still alive at start of each age"

	***Count number deaths at each year
	bysort age: gegen age_deaths_n = count(death_YN) if death_YN==1
	gsort age -death_YN t
	by age: replace age_deaths_n=age_deaths_n[_n-1] if _n>1
	label var age_deaths_n "Number died during each age"


	********************************************************
	**SIMULATE LUNG CANCER FOR SCENARIOS
	********************************************************


	*****WITHOUT INTERVENTIONS

	**Model 0 - silica only no smoking 
	bysort id (t):generate byte LC0= rbinomial(1, invlogit(logit(base) + ln(1.3)*sex + ln(`sileff')*silcumi_base))
	bysort id (t):generate byte LC0a= rbinomial(1, invlogit(logit(base) + ln(1.3)*sex + ln(`diseff')*dieselcumi_base + ln(`sileff')*silcumi_base))


	**BACKGROUND - smoking only model approx general population 
	bysort id (t):generate byte LC1= rbinomial(1, invlogit(logit(base) + ln(1.3)*sex + ln(`smkeff')*pack_yearsi_base))


	**BASE Model - No inteventions - smoking + silica + diesel exposure + interaction between sil and dis (note if set to 1 i.e. no effect/interaction so term dropped)
	bysort id (t):generate byte LC_base= rbinomial(1, invlogit(logit(base) + ln(1.3)*sex + ln(`smkeff')*pack_yearsi_base + ln(`diseff')*dieselcumi_base + ln(`sileff')*silcumi_base + ln(`interact')*(dieselcumi_base*silcumi_base)))


	*****WITH INTERVENTIONS

	***SILICA &/OR DIESEL INTERVENTION EFFECT AVE/MAX REDUCED BY ??%

	**Intevention Model - smoking + silica + diesel exposure + interaction between sil and dis (note if set to 1 i.e. no effect/interaction so term dropped)
	bysort id (t):generate byte LC_int= rbinomial(1, invlogit(logit(base) + ln(1.3)*sex + ln(`smkeff')*pack_yearsi_base + ln(`diseff')*dieselcumi_int + ln(`sileff')*silcumi_int + ln(`interact')*(dieselcumi_int*silcumi_int)))


	
	label define LC 0 "No/Pre-LC" 1 "LC" 2 "Post LC" , replace 

	*keep id t LC* year age year_of_death 
	**drop years post death
	bysort id (year): drop if year>year_of_death


	sort id t	
	foreach var of varlist LC* {
	**identify those with LC
	di "`var'"
	by id: replace `var'=2 if t>1 & `var'[_n-1]==1
	by id: replace `var'=2 if t>1 & `var'[_n-1]==2
	label values `var' LC, 

	**calculate the number of working years life lost in those of working age with LC
	**assume expected working life is 65
	gen wkyll`var'=`retage'-age if `var'==1 & age<65 


	}	


	****STORING OUTCOMES*** 
	****************************

	**Repeat for when participants enter work all (1940), 10 years pre intervention (2000), and at start of intervention (2010)
	**modify restrictions e.g. keep if entered work in 1950 1990 2010
	**follow up 60 years post intervention (note last new entry is 2060)
	keep if year<=2100
	**identify the LC scenarios required (all here)
	keep year age LC* wkyll*

	quietly foreach var of varlist LC* {
		di "`var'"
	**calc number LC free at start of each year (id if there were LC free)
		gen ind_n`var'=1 if `var'<=1
	**Create vars to describe represent age at LC
		gen m_age`var'=age if `var'==1
		gen sd_age`var'=age if `var'==1
		gen p50_age`var'=age if `var'==1
		gen p25_age`var'=age if `var'==1
		gen p75_age`var'=age if `var'==1
	**keep working years of life lost at year LC occurrred to total
		replace wkyll`var'=. if `var'!=1
	**replace years post LC occrred with 0 in indicator (for calc of total LC per year)
		replace `var'=0 if `var'==2
		}

	**gen number entering study each year (not just still alive) i.e. Once 14 join the cohort
	gen ind_join=1 if age==14
		
	**collapse and calc annual summaries for each measure	
	collapse (sum) LC* ind_n* wkyll* ind_join (mean) m_age* (sd) sd_age* (p50) p50_age* (p25) p25_age* (p75) p75_age* , by(year)

	**generate simulation number for later merge
	gen sim_no=`r'

	**append to results file
	append using "`save'/LC_datafile"

	**save results
	save "`save'/LC_datafile", replace
	noisily display in yellow "`r'," _c
	}
	noisily display in yellow " " _c
	timer off 1
	noisily display in yellow " " _c
	noisily di "Results of time = "
	timer list
	noisily di "Finish Date/Time = $S_DATE $S_TIME"


	
****LATER SUMMARY STATS - FIRST NEED TO MERGE DATA


**collapse to annual data of LC cases and number alive at start of year
	quietly foreach var of varlist LC* m_age* wkyll* {
		gen se`var'=`var'
	}
	
	collapse (sum) LC* ind_n* wkyll* (mean) m_age* p50_age* p25_age* p75_age* (sd) sem_age* seLC* sewkyll*, by(year)

**generate incidence rate & working years of life lost per 100000 for each year
	quietly foreach var of varlist LC* {
		gen inc`var'=(`var'/ind_n`var')*100000
		replace wkyll`var'=(wkyll`var'/ind_n`var')*100000
	}

	order year LC* incLC* seLC* ind* m_age* p50* p25* p75* wkyll* sem_age* 

	qui keep if year<=2070 & year>1940

**gen cumulative wrkyll since intervention year
	quietly foreach var of varlist incLC* wkyll* {
		gen cu`var'= `var' if year==`year'
		replace cu`var'=cu`var'[_n-1]+`var' if year>`year'
	}

	
**Gen confidence intervals on annual incidence rates
	quietly foreach var of varlist LC* {
		gen lc`var'=`var'-1.96*se`var'
		replace lc`var'=(lc`var'/ind_n`var')*100000
		gen uc`var'=`var'+1.96*se`var'
		replace uc`var'=(uc`var'/ind_n`var')*100000
	}

**Gen confidence intervals on mean age of LC diagnosis

	quietly foreach var of varlist m_age* {
		gen lc`var'=`var'-1.96*se`var'
		gen uc`var'=`var'+1.96*se`var'
	}
	
gen year_int = year-`year'	
label var year_int "Year since Intervention occurred"	

***Smoking plus SILICA 25% effect, no decay, 5, 15 year decay  
****************************************************************************************

**generate lowes smooth of confidene intervals
foreach var of varlist incLC* ucLC* lcLC* {
	lowess `var' year, gen(smth_`var') bwidth(0.3) nograph 
}

**plot the incidence rates per year
qui twoway (lowess smth_incLC1 year_int, bwidth(0.3) legend(label(1 "Smk Only")) lcolor(black)) ///
(rarea smth_lcLC1 smth_ucLC1 year_int, fcolor(gray%50) legend(label(2 "Smk Only 95% CI")) lcolor(gray%5)) ///
(lowess smth_incLC_base year_int, bwidth(0.3) legend(label(3 "Baseline Model")) lcolor(green) lwidth(thick)) ///
(rarea smth_lcLC_base smth_ucLC_base year_int, fcolor(green%50) legend(label(4 "Baseline Model 95%CI")) lcolor(green%5)) ///
(lowess smth_incLC_int year_int, bwidth(0.3) legend(label(5 "Intervention Model"))  lpattern(dash) lcolor(red) lwidth(thick)) ///
(rarea smth_lcLC_int smth_ucLC_int year_int, fcolor(red%50) legend(label(6 "Intervention Model 95%CI")) lcolor(red%5)) if year >= 2000 & year <= 2060 ///
, xline(0) xtitle("Year Since Intervantion") ytitle("Lung Cancer Inc per 100,000") xlabel(0(10)50) xscale(range(0 50)) saving("`save'/LCincplot", replace) legend(pos(2) ring(0))

**plot the cumulative working years of life lost per year
qui twoway (lowess cuwkyllLC1 year_int, bwidth(0.3) legend(off) lcolor(black)) ///
(lowess cuwkyllLC_base year_int, bwidth(0.3) legend(off)  lcolor(green) lwidth(thick)) ///
(lowess cuwkyllLC_int year_int, bwidth(0.3) legend(off)   lpattern(dash) lcolor(red) lwidth(thick)) if year >= 2000 & year <= 2060 ///
, xline(0) xtitle("Year Since Intervantion") ytitle("Cumulative Working Years of Life Lost per 100,000 pys") xlabel(0(10)50) xscale(range(0 50)) saving("`save'/cwkyllplot", replace)


graph combine "`save'/LCincplot.gph" "`save'/cwkyllplot.gph"


qui label var incLC1 "Inc per 100000pys (smoking only)"
qui label var incLC_base "Inc per 100000pys (base model)"
qui label var incLC_int "Inc per 100000pys (intervention model)"
qui label var m_ageLC1 "Mean age (smoking only)"
qui label var m_ageLC_base "Mean age (base model)"
qui label var m_ageLC_int "Mean age (intervention model)"
qui label var p50_ageLC1 "Median age (smoking only)"
qui label var p50_ageLC_base "Median age (base model)"
qui label var p50_ageLC_int "Median age (intervention model)"
qui label var p25_ageLC1 "25th Percentile (smoking only)"
qui label var p25_ageLC_base "25th Percentile (base model)"
qui label var p25_ageLC_int "25th Percentile (smoking only)"
qui label var p75_ageLC1 "75th Percentile (smoking only)"
qui label var p75_ageLC_base "75th Percentile (base model)"
qui label var p75_ageLC_int "75th Percentile (intervention model)"
qui label var wkyllLC1 "Working Years of life lost (smoking only)"
qui label var wkyllLC_base "Working Years of life lost (base model)"
qui label var wkyllLC_int "Working Years of life lost (intervention model)"
qui label var cuwkyllLC1 "Cumulative Working Years of life lost (smoking only)"
qui label var cuwkyllLC_base "Cumulative Working Years of life lost (base model)"
qui label var cuwkyllLC_int "Cumulative Working Years of life lost (intervention model)"
qui label var lcLC1 "95%LCI Inc per 100000pys (smoking only)"
qui label var ucLC1 "95%UCI Inc per 100000pys (smoking only)"
qui label var lcLC_base "95%LCI Inc per 100000pys (base model)"
qui label var ucLC_base "95%UCI Inc per 100000pys (base model)"
qui label var lcLC_int "95%LCI Inc per 100000pys (intervention model)"
qui label var ucLC_int "95%UCI Inc per 100000pys (intervention model)"
qui label var cuincLC1 "Cumulative Inc per 100000pys (smoking only)"
qui label var cuincLC_base "Cumulative Inc per 100000pys (base model)"
qui label var cuincLC_int "Cumulative Inc per 100000pys (intervention model)"
qui label var lcm_ageLC1 "95%LCI Inc per 100000pys (smoking only)"
qui label var ucm_ageLC1 "95%UCI Inc per 100000pys (smoking only)"
qui label var lcm_ageLC_base "95%LCI Inc per 100000pys (base model)"
qui label var ucm_ageLC_base "95%UCI Inc per 100000pys (base model)"
qui label var lcm_ageLC_int "95%LCI Inc per 100000pys (intervention model)"
qui label var ucm_ageLC_int "95%UCI Inc per 100000pys (intervention model)"

qui label var LC1 "Raw count of new cases of LC smoking only model"
qui label var LC_base "Raw count of new cases of LC base model"
qui label var LC_int "Raw count of new cases of LC intervention model"
qui label var seLC1 "Standard Error for Incidence Rate smokng only model"
qui label var seLC_base "Standard Error for Incidence Rate base model"
qui label var seLC_int "Standard Error for Incidence Rate intervention model"
qui label var ind_nLC1 "Number of health participants at start of year smokng only model"
qui label var ind_nLC_base "Number of health participants at start of year base model"
qui label var ind_nLC_int "Number of health participants at start of year intervention model"
qui label var sem_ageLC1 "Standard Error for mean age at LC diagnosis smokng only model"
qui label var sem_ageLC_base "Standard Error for mean age at LC diagnosis base  model"
qui label var sem_ageLC_int "Standard Error for mean age at LC diagnosis intervention model"
qui label var sewkyllLC1 "Standard Error for working years of life lost diagnosis smokng only model"
qui label var sewkyllLC_base "Standard Error for working years of life lost diagnosis base model"
qui label var sewkyllLC_int "Standard Error for working years of life lost diagnosis intervention model"
qui label var smth_incLC1 "LOWESS smoothed incidence rate for annual plot smokng only model"
qui label var smth_incLC_base "LOWESS smoothed incidence rate for annual plot base model"
qui label var smth_incLC_int "LOWESS smoothed incidence rate for annual plot intervention model"
qui label var smth_ucLC1 "LOWESS smoothed incidence rate for 95% upper confidence interval annual plot smokng only model"
qui label var smth_ucLC_base "LOWESS smoothed incidence rate for 95% upper confidence interval annual plot base model"
qui label var smth_ucLC_int "LOWESS smoothed incidence rate 95% upper confidence interval for annual plot interventiomodel"
qui label var smth_lcLC1 "LOWESS smoothed incidence rate for 95% lower confidence interval annual plot smokng only model"
qui label var smth_lcLC_base "LOWESS smoothed incidence rate 95% lower confidence interval for annual plot base model"
qui label var smth_lcLC_int "LOWESS smoothed incidence rate 95% lower confidence interval for annual plot intervention model"

di "*******"
di "RESULTS"
di "*******"
di ""
di ""
di "Base = Model as Defined by user"
di "Intervention = Model with intervention applied"
di ""
di ""
di "Inicidence Rates per 100000 person years in future forecast years post intervention"
table () year_int if year_int==0 | year_int==10 | year_int==20 | year_int==30 | year_int==40 | year_int==50, stat(min incLC1 lcLC1 ucLC1 incLC_base lcLC_base ucLC_base incLC_int lcLC_int ucLC_int cuincLC1 cuincLC_base cuincLC_int) nototals nformat(%9.1f)

di ""
di "Mean age of LC diagnosis in future forecast years post intervention"
table () year_int if year_int==0 | year_int==10 | year_int==20 | year_int==30 | year_int==40 | year_int==50, stat(min  m_ageLC1 lcm_ageLC1 ucm_ageLC1 m_ageLC_base lcm_ageLC_base ucm_ageLC_base m_ageLC_int lcm_ageLC_int ucm_ageLC_int) nototals nformat(%9.1f)


di ""
di "Median (& IQR) for age of LC diagnosis for future forecast years post intervention"
table () year_int if year_int==0 | year_int==10 | year_int==20 | year_int==30 | year_int==40 | year_int==50, stat(min p50_ageLC1 p25_ageLC1 p75_ageLC1 p50_ageLC_base p25_ageLC_base p75_ageLC_base  p50_ageLC_int p25_ageLC_int p75_ageLC_int) nototals nformat(%9.1f)

di ""
di "Working Years of Life List in future forecast years post intervention"
table () year_int if year_int==0 | year_int==10 | year_int==20 | year_int==30 | year_int==40 | year_int==50, stat(min wkyllLC1 wkyllLC_base wkyllLC_int cuwkyllLC1 cuwkyllLC_base cuwkyllLC_int) nototals nformat(%9.1f) 


keep year *LC1* *LC_base* *LC_int*

di "Note, data file for annual incidence rates, and plot of incidence rates & cumulative working years of life lost by year saved user defined location"



	end	
		
