-
Notifications
You must be signed in to change notification settings - Fork 0
/
CreateKinshipMatrix.R
executable file
·212 lines (186 loc) · 6.33 KB
/
CreateKinshipMatrix.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#!/usr/bin/env Rscript
#
# Script: CreateKinshipMatrix.R
# Purpose: Convert table of family identies twin status into a "kinship" matrix
# Author: Thomas Nichols
# Version: http://github.com/nicholst/HCPutils/tree/$Format:%h$
# $Format:%ci$
#
Usage='
Usage: CreateKinshipMatrix.R In.csv Out.csv [FamIDs.csv]
In.csv is plain CSV file, with the first row containing the
variable names, having at least the variable names: "Subject",
"Mother_ID", "Father_ID", "ZygositySR" and, "ZygosityGT".
Zygosity is drawn from ZygosityGT preferentially, and then as backup
from ZygositySR.
Out.csv will be a nSubject by nSubject matrix, with row and column
names set by the variable "Subject", with values:
-1 Self
0 Unrelated
1 MZ twins
2 DZ twins
3 Full siblings (FS)
4 Half siblings (HS)
If FamIDs.csv is specified, a table with two versions of a family ID
will be saved. "FamID" is the variable formed by the appending of
Mother_ID and Father_ID for each subject. "ExtFamID" is an identifier
for an extended family, consisting of sorted IDs of all parents of
children in the extended family.
NOTE: Assumes there is no more than one twin pair per family.
'
## Algorithm:
##
## For the HCP, a "family" is a set of individuals for which each pair of
## individuals has some relationship, either DZ, MZ, FS or HS; we also
## require that no relationships exist between families.
##
## Families can be identified by a two-step process.
##
## 1. Identify all mothers.
## For each mother:
## a. Define all children of a mother tentatively to be "a family".
## b. Create a family ID from the list comprised of this mother
## and all fathers of her children.
## c. For each father, increment the "FamiliesFathered" variable,
## to keep track of how many families each father is a part of.
##
## 2. Process fathers appearing in 2 or more families (fathers
## appearing in exactly 1 family cannot create extended families)
## For each such father:
## a. Find all children sired by this father.
## b. Iteratively search for more parents: Find all parents of
## these children, then find all children of those parents,
## then find all parents of those children, etc; repeat until
## the list of parents stops growing.
## b. Create a new single family ID consisting of the union of
## all mother and father found, and finally assign to all
## children found.
args = commandArgs(TRUE)
if (length(args)<2 || length(args)>3)
stop(Usage)
InFn = args[[1]]
OutFn = args[[2]]
if (length(args)>2) {
FamFn = args[[3]]
} else {
FamFn = NULL
}
dat=read.csv(InFn)
nSubj=dim(dat)[1]
# Make unique identifier for each family
dat$FamID=factor(paste(dat$Mother_ID,dat$Father_ID,sep="_"))
MZ=DZ=FS=HS=matrix(0,ncol=nSubj,nrow=nSubj)
ExtFamID=vector("list",nSubj)
Zyg=as.character(dat$ZygosityGT)
# Fill in missing ZygosityGT values from ZygositySR
BadZ=!(Zyg=="MZ" | Zyg=="DZ")
Zyg[BadZ]=as.character(dat$ZygositySR[BadZ])
old=0
cat("Processing mothers ")
AllFath=c()
Blend=c()
for (moth in levels(factor(dat$Mother_ID))) {
cat(".")
Imoth = (dat$Mother_ID==moth)
Faths=c()
Fams=c()
for (fath in unique(as.character(dat$Father_ID[Imoth]))) {
fam=paste(moth,fath,sep="_")
Fams=c(Fams,fam)
Faths=c(Faths,fath)
AllFath=c(AllFath,fath)
Ifam=dat$FamID==fam
# Any family relationship
Ifam = (dat$FamID==fam)
FS = FS + Ifam%*%t(Ifam)
# MZ relationship
Ifam = (dat$FamID==fam) & (Zyg == "MZ")
MZ = MZ + Ifam%*%t(Ifam)
# DZ relationship
Ifam = (dat$FamID==fam) & ( (Zyg == "DZ")|(Zyg == "NotMZ") )
DZ = DZ + Ifam%*%t(Ifam)
}
# Extended family ID using all fathers
ExtFam=sort(c(moth,Faths))
for (i1 in which(Imoth))
ExtFamID[[i1]]=ExtFam
# Consider blended families
if (length(Fams)>1) {
for (i1 in 1:length(Fams)) {
fam1=Fams[i1]
Blend=c(Blend,which(dat$FamID==fam1))
if (i1<length(Fams)) {
for (i2 in (i1+1):length(Fams)) {
fam2 = Fams[i2]
Ifam1 = dat$FamID==fam1
Ifam2 = dat$FamID==fam2
HS = HS + Ifam1%*%t(Ifam2) + Ifam2%*%t(Ifam1)
}
}
}
}
}
cat("\nProcessing additional fathers ")
# Now do fathers that appear in 2 or more families not already caught above
DupFath=AllFath[duplicated(AllFath)]
for (fath in DupFath) {
cat(".")
ExtFam=c()
Ifath = (dat$Father_ID==fath)
Fams=unique(as.character(dat$FamID[Ifath]))
for (i1 in 1:length(Fams)) {
ii=which(dat$FamID==Fams[i1])[1] # All offspring in a family have same ExtFamID
ExtFam=c(ExtFam,ExtFamID[[ii]])
fam1=Fams[i1]
Blend=c(Blend,which(dat$FamID==fam1))
if (i1<length(Fams)) {
for (i2 in (i1+1):length(Fams)) {
fam2=Fams[i2]
Ifam1 = dat$FamID==fam1
Ifam2 = dat$FamID==fam2
HS = HS + Ifam1%*%t(Ifam2) + Ifam2%*%t(Ifam1)
}
}
}
ExtFam=sort(unique(ExtFam))
OrigExtFam=ExtFam
# Check for distant relations
nExtFamOld=0
while (length(ExtFam)>nExtFamOld) {
nExtFamOld=length(ExtFam)
# Find all offspring of this set of parents
Ioff = dat$Mother_ID%in%ExtFam | dat$Father_ID%in%ExtFam
# Find all parents of these offspring
ExtFam=sort(unique(c(dat$Mother_ID[Ioff],dat$Father_ID[Ioff])))
}
# Apply this new extended family ID to all subjects in the extended family
Ioff = dat$Mother_ID%in%ExtFam | dat$Father_ID%in%ExtFam
for (i in which(Ioff))
ExtFamID[[i]]=ExtFam
}
cat("\n")
# Do some nonintuitive arithmatic to get final matrix
Adj=FS*3-MZ*2-DZ
Adj=Adj-diag(diag(Adj))-1*diag(nSubj)
Adj=Adj+4*HS
# Add column names
colnames(Adj)<-as.character(dat$"Subject")
rownames(Adj)<-as.character(dat$"Subject")
write.csv(Adj,OutFn)
if (!is.null(FamFn)) {
# Create final extended Family ID
cat("Creating extended family ID's ")
tmp=character(nSubj)
for (eid in unique(ExtFamID)) {
cat(".")
tmp[ExtFamID%in%list(eid)] = paste(eid,collapse="_")
}
dat$ExtFamID=factor(tmp)
cat("\n")
write.csv(cbind(Subject=dat$Subject,FamID=as.character(dat$FamID),ExtFamID=as.character(dat$ExtFamID)),FamFn)
if (length(Blend)>0) {
cat("WARNING: Blended families\n")
print(cbind(dat[Blend,c("Subject","Mother_ID","Father_ID")],Zygosity=Zyg[Blend],ExtFamID=as.character(dat$ExtFamID[Blend])))
cat("\n")
}
}