-
Notifications
You must be signed in to change notification settings - Fork 0
/
Real_random.R
180 lines (140 loc) · 9.17 KB
/
Real_random.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
library( GenomicRanges)
library( tibble)
library( plyranges)
library( ggplot2)
library( dplyr)
Genes <- read.table( "/home/dankent/Data/HG_Esembl98_Genes.tab", header = T, sep = "\t")
Want <- c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X") # no TADs on the Y chromosome in the data
Genes <- subset(Genes, Genes$Chromosome.scaffold.name %in% Want)
Genes$Chromosome.scaffold.name <- paste0( 'chr',Genes$Chromosome.scaffold.name)
Genes$width <- Genes$Gene.end..bp.- Genes$Gene.start..bp.
Genes$mid_point <- (Genes$width/2)+Genes$Gene.start..bp.
HG <- Genes$HGNC.symbol
Lisa_genes <- as.factor(c("MYC", "BCL6", "NFKB2", "BCL3", "BCL11A", "CCND3", "BCL10", "CHST11", "LAPTM5",
"CEBPG", "BCL9","CEBPB", "DDX6","MALT1", "RHOH", "PAFAH1B2", "PCSK7", "EPOR", "CCND1", "FOXP1",
"CDK6", "FCGR2B", "PAX5", "IRF4", "MUC1", "BCL2", "CCND2", "ETV6", "MAF", "CRLF2",
"CEBPA", "CEBPD", "IL3", "CEBPE", "ID4", "NBEAP1", "LHX4", "FGFR3", "MAFB"))
NSD2 <- Genes[Genes$HGNC.symbol=="WHSC1",]
IGL <- Genes[Genes$Gene.start..bp.==22380474,]
Lisa_genes_match <- na.omit(match(Lisa_genes, HG))
Lisa_genes_match <- Genes[Lisa_genes_match,]
Lisa_genes_match <- rbind(Lisa_genes_match, NSD2, IGL)
Lisa_genes_match_DF <- Lisa_genes_match
Lisa_genes_match <- makeGRangesFromDataFrame(Lisa_genes_match, keep.extra.columns = TRUE, ignore.strand = TRUE, start.field = "Gene.start..bp.", end.field = "Gene.end..bp.", seqnames.field = "Chromosome.scaffold.name")
TADs_BI_GM12878 <- read.table("/home/dankent/Data/GM12878/TAD_domains/GSE63525_GM12878_primary+replicate_Arrowhead_domainlist.txt", header = TRUE)
TADs_BI_GM12878 <- TADs_BI_GM12878[ ,c(1,2,3,8)]
colnames(TADs_BI_GM12878) <- c("chromosome", "start", "end", "corner_score")
TADs_BI_GM12878$chromosome<- paste0( 'chr',TADs_BI_GM12878$chromosome)
TADs_process_for_overlap <- function( input_variable){
input_variable <- tibble::rowid_to_column( input_variable, "ID")
input_variable$widthTAD <- ( input_variable$end - input_variable$start)
input_variable$startTAD <- input_variable$start
input_variable$endTAD <- input_variable$end
makeGRangesFromDataFrame( input_variable, keep.extra.columns = TRUE, ignore.strand = TRUE)
}
TADs_GM12878 <- TADs_process_for_overlap(TADs_BI_GM12878)
TADs_GM12878 <- data.frame(reduce_ranges(TADs_GM12878))
TADs_GM12878$widthTAD <- ( TADs_GM12878$end - TADs_GM12878$start)
TADs_GM12878$startTAD <- TADs_GM12878$start
TADs_GM12878$endTAD <- TADs_GM12878$end
TADs_GM12878 <- tibble::rowid_to_column( TADs_GM12878, "ID")
TADs_GM12878 <- makeGRangesFromDataFrame( TADs_GM12878, keep.extra.columns = T )
##### get random genes and split into 500 chunks of 41 genes (same length as original) #####
Combined <- bind_rows(replicate(41, Genes %>% sample_n(1000,replace = T), simplify=F), .id="Obs")
Combined_GR <- makeGRangesFromDataFrame(Combined, keep.extra.columns = TRUE, ignore.strand = TRUE, start.field = "Gene.start..bp.", end.field = "Gene.end..bp.", seqnames.field = "Chromosome.scaffold.name")
Combined_nearest <- GenomicRanges::nearest(Combined_GR, TADs_GM12878)
Combined_nearest <- TADs_GM12878[Combined_nearest]
Combined_nearest_df <- data.frame(Combined_nearest)
GeneandTAD <- cbind(Combined, Combined_nearest_df)
GeneandTAD$distance <- apply(data.frame(abs(GeneandTAD$mid_point - GeneandTAD$startTAD), abs(GeneandTAD$mid_point - GeneandTAD$endTAD)),1,min)
Grouped <- split(GeneandTAD,GeneandTAD$Obs)
#numeric data change
test <- function(list){
list[,19] <- as.numeric(list[,19])
}
med_distance <- lapply(Grouped, test)
med_distance <- unlist(lapply(med_distance, median))
##### list method without replicate() #######
real_random_ranges <- sample_n(tbl = Genes, size = 20500)
real_random_ranges <- split(real_random_ranges, sample(rep(500)))
#since all data is now help in nested lists, I build fucntions with basic functions within, to apply across the whole list
#create Granges
real_random_ranges_to_GRanges_func <- function(list){
makeGRangesFromDataFrame(list, keep.extra.columns = TRUE, ignore.strand = TRUE, start.field = "Gene.start..bp.", end.field = "Gene.end..bp.", seqnames.field = "Chromosome.scaffold.name")
}
GR_real_random_ranges <- lapply(real_random_ranges,real_random_ranges_to_GRanges_func)
#find nearest TAD to each gene
nearest_for_list <- function(list){GenomicRanges::nearest(list, TADs_GM12878)}
near <- lapply(GR_real_random_ranges, nearest_for_list)
#identify the nearest TAD using the above
Tad_Extract <- function(list){
TADs_GM12878[list]
}
TAD_match_random <- lapply(near, Tad_Extract)
#create data frames
TAD_match_random <- lapply(TAD_match_random, data.frame)
real_random_ranges <- lapply(real_random_ranges, data.frame)
#extract important information
midpoint_all <- unlist(lapply(real_random_ranges, `[`, 8))
TADstart <- unlist(lapply(TAD_match_random, `[`, 8))
TADend <- unlist(lapply(TAD_match_random, `[`, 9))
#calculate the absolute distance and find the minimum distance (either TAD start of TAD end)
Distance_Random_mid_to_TAD <- data.frame(cbind(abs(midpoint_all - TADstart),abs(midpoint_all - TADend)))
Distance_Random_mid_to_TAD <- apply(Distance_Random_mid_to_TAD,1,min)
#mean average
median(Distance_Random_mid_to_TAD)
Distance_Random_mid_to_TAD_df <- data.frame(Distance_Random_mid_to_TAD)
############################################
### calculate distance for actual genes ####
############################################
TAD_match <- TADs_GM12878[nearest(Lisa_genes_match, TADs_GM12878)]
Distance_mid_to_TAD <- data.frame(cbind(abs(Lisa_genes_match$mid_point - TAD_match$startTAD),abs(Lisa_genes_match$mid_point - TAD_match$endTAD)))
Distance_mid_to_TAD <- apply(Distance_mid_to_TAD,1,min)
median(Distance_mid_to_TAD)
Distance_mid_to_TAD_df <- data.frame(Distance_mid_to_TAD)
Clean_up <- cbind(data.frame(Lisa_genes_match), Distance_mid_to_TAD_df, TAD_match)
Clean_up <- Clean_up[c(1:3,7,9:10,12,13)]
Clean_up %>% arrange(Distance_mid_to_TAD)
#### Visualise ####
# random - replicated 41 times for 1000 sample size (small error)
boxplot(Distance_mid_to_TAD_df$Distance_mid_to_TAD, med_distance, outline=FALSE,
ylab = "Distance from gene mid-point to TAD start/end", xlab = "IGH Partner Genes Random", main = "Distance from Gene to TAD boundary", col = c("#E69F00", "#56B4E9"))
#p-value
(sum(as.numeric(med_distance) < median(Distance_mid_to_TAD)))/1000
#random - entire dataset
boxplot(Distance_mid_to_TAD_df$Distance_mid_to_TAD, Distance_Random_mid_to_TAD, outline=FALSE,
ylab = "Distance from gene mid-point to TAD start/end", xlab = "IGH Partner Genes Random", main = "Distance from Gene to TAD boundary", col = c("#E69F00", "#56B4E9"))
####### all genes #####
Genes_GR <- makeGRangesFromDataFrame(df = Genes, keep.extra.columns = T, seqnames.field = "Chromosome.scaffold.name", start.field = "Gene.start..bp.", end.field = "Gene.end..bp.")
ALL_gene_TAD_match <- data.frame(TADs_GM12878[nearest(Genes_GR, TADs_GM12878)])
ALL_gene_TAD_match <- cbind(ALL_gene_TAD_match, Genes)
Distance_mid_to_TAD <- data.frame(cbind(abs(ALL_gene_TAD_match$mid_point - ALL_gene_TAD_match$startTAD),abs(ALL_gene_TAD_match$mid_point - ALL_gene_TAD_match$endTAD)))
Distance_mid_to_TAD <- apply(Distance_mid_to_TAD,1,min)
Distance_mid_to_TAD_df <- data.frame(Distance_mid_to_TAD)
Genes_match <- cbind(ALL_gene_TAD_match, Distance_mid_to_TAD_df)
test <- Genes_match %>%
select("seqnames", "start", "end", "Gene.stable.ID", "HGNC.symbol", "Chromosome.scaffold.name", "Gene.start..bp.", "Gene.end..bp.", "Gene.type", "mid_point", "Distance_mid_to_TAD") %>%
group_by(Distance_mid_to_TAD) %>%
arrange(Distance_mid_to_TAD)
test <- test %>%
mutate(Group = case_when(Distance_mid_to_TAD < 36098 ~ "Q1",
Distance_mid_to_TAD >= 36098 & Distance_mid_to_TAD < 94271 ~ "Q2",
Distance_mid_to_TAD >= 94271 & Distance_mid_to_TAD < 203620 ~ "Q3",
Distance_mid_to_TAD > 203620 ~ "Q4"))
final <- data.frame(test[test$HGNC.symbol %in% Lisa_genes_match_DF$HGNC.symbol ,]) #for some reason IRF4 matches to the wrong nearest TAD
#adjustment for the correct IRF4 nearest TAD
IRF4 <- Genes[Genes$HGNC.symbol == "IRF4", ]
IRF4_GR <- makeGRangesFromDataFrame(IRF4, keep.extra.columns = TRUE, ignore.strand = TRUE, start.field = "Gene.start..bp.", end.field = "Gene.end..bp.", seqnames.field = "Chromosome.scaffold.name")
IRF4 <- cbind(TADs_GM12878[nearest(IRF4_GR, TADs_GM12878)], IRF4)
IRF4 <- IRF4 %>%
select("seqnames", "start", "end", "Gene.stable.ID", "HGNC.symbol", "Chromosome.scaffold.name", "Gene.start..bp.", "Gene.end..bp.", "Gene.type", "mid_point") %>%
mutate(Distance_mid_to_TAD = min(abs(IRF4$mid_point - IRF4$start),abs(IRF4$mid_point - IRF4$end))) %>%
mutate(Group = "Q1")
final <- final[-5,]
final <- rbind(IRF4, final)
final <- data.frame(table(final$Group))
levels(final$Var1) <- c("Q1", "Q2", "Q3", "Q4")
ggplot(final, aes(x=factor(Var1), y = Freq)) +
geom_boxplot() +
xlab("Quartile")+
ylab("Frequency")