-
Notifications
You must be signed in to change notification settings - Fork 0
/
nB_B-cell_partner_genes.R
120 lines (94 loc) · 5.8 KB
/
nB_B-cell_partner_genes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
library( GenomicRanges)
library( tibble)
library( plyranges)
library( dplyr)
Genes <- read.table( "/home/dankent/Data/HG_Esembl98_Genes.tab", header = T, sep = "\t")
Want <- c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X") # no TADs on the Y chromosome in the data
Genes <- subset(Genes, Genes$Chromosome.scaffold.name %in% Want)
Genes$Chromosome.scaffold.name <- paste0( 'chr',Genes$Chromosome.scaffold.name)
PC <- "protein_coding"
Genes <- subset(Genes, Genes$Gene.type %in% PC)
Genes$width <- Genes$Gene.end..bp.- Genes$Gene.start..bp.
Genes$mid_point <- (Genes$width/2)+Genes$Gene.start..bp.
HG <- Genes$HGNC.symbol
Lisa_genes <- as.factor(c("MYC", "CEBPG", "CEBPB", "EPOR", "DDX6", "BCL2", "ETV6", "CRLF2", "CEBPA",
"CEBPD","IL3", "CEBPE", "ID4", "LHX4", "MIR125B1"))
#Lisa_genes <- as.factor(c( "ENSG00000136997", "ENSG00000153879", "ENSG00000172216", "ENSG00000187266", "ENSG00000110367", "ENSG00000171791",
# "ENSG00000139083", "ENSG00000205755", "ENSG00000245848", "ENSG00000221869", "ENSG00000164399", "ENSG00000092067",
# "ENSG00000172201", "ENSG00000121454", "ENSG00000207971"))
Lisa_genes_match <- na.omit(match(Lisa_genes, HG))
Lisa_genes_match <- Genes[Lisa_genes_match,]
Lisa_genes_match <- makeGRangesFromDataFrame(Lisa_genes_match, keep.extra.columns = TRUE, ignore.strand = TRUE, start.field = "Gene.start..bp.", end.field = "Gene.end..bp.", seqnames.field = "Chromosome.scaffold.name")
TADs_BI_GM12878 <- read.table("/home/dankent/Data/GM12878/TAD_domains/GSE63525_GM12878_primary+replicate_Arrowhead_domainlist.txt", header = TRUE)
TADs_BI_GM12878 <- TADs_BI_GM12878[ ,c(1:3)]
colnames(TADs_BI_GM12878) <- c("chromosome", "start", "end")
TADs_BI_GM12878$chromosome <- paste0( 'chr',TADs_BI_GM12878$chromosome)
TADs_BI_GM12878 <- makeGRangesFromDataFrame(TADs_BI_GM12878)
TADs_GM12878 <- data.frame(reduce_ranges(TADs_BI_GM12878))
TADs_GM12878$widthTAD <- ( TADs_GM12878$end - TADs_GM12878$start)
TADs_GM12878$startTAD <- TADs_GM12878$start
TADs_GM12878$endTAD <- TADs_GM12878$end
TADs_GM12878 <- tibble::rowid_to_column( TADs_GM12878, "ID")
TADs_GM12878 <- makeGRangesFromDataFrame( TADs_GM12878, keep.extra.columns = T )
##### get random genes and split into 500 chunks of 41 genes (same length as original) #####
real_random_ranges <- sample_n(tbl = Genes, size = 15000)
real_random_ranges <- split(real_random_ranges, sample(rep(1000)))
#since all data is now help in nested lists, I build fucntions with basic functions within, to apply across the whole list
#create Granges
real_random_ranges_to_GRanges_func <- function(list){
makeGRangesFromDataFrame(list, keep.extra.columns = TRUE, ignore.strand = TRUE, start.field = "Gene.start..bp.", end.field = "Gene.end..bp.", seqnames.field = "Chromosome.scaffold.name")
}
GR_real_random_ranges <- lapply(real_random_ranges,real_random_ranges_to_GRanges_func)
#find nearest TAD to each gene
nearest_for_list <- function(list){GenomicRanges::nearest(list, TADs_GM12878)}
near <- lapply(GR_real_random_ranges, nearest_for_list)
#identify the nearest TAD using the above
Tad_Extract <- function(list){
TADs_GM12878[list]
}
TAD_match_random <- lapply(near, Tad_Extract)
TAD_match_random <- lapply(TAD_match_random, data.frame)
real_random_ranges <- lapply(real_random_ranges, data.frame)
#extract important information
midpoint_all <- unlist(lapply(real_random_ranges, `[`, 8))
TADstart <- unlist(lapply(TAD_match_random, `[`, 8))
TADend <- unlist(lapply(TAD_match_random, `[`, 9))
#calculate the absolute distance and find the minimum distance (either TAD start of TAD end)
Distance_Random_mid_to_TAD <- data.frame(cbind(abs(midpoint_all - TADstart),abs(midpoint_all - TADend)))
Distance_Random_mid_to_TAD <- apply(Distance_Random_mid_to_TAD,1,min)
#mean average
median(Distance_Random_mid_to_TAD)
# 94390.5
Distance_Random_mid_to_TAD_df <- data.frame(Distance_Random_mid_to_TAD)
############################################
### calculate distance for actual genes ####
############################################
#near_obs <- nearest(Lisa_genes_match, TADs_GM12878)
TAD_match <- TADs_GM12878[nearest(Lisa_genes_match, TADs_GM12878)]
Distance_mid_to_TAD <- data.frame(cbind(abs(Lisa_genes_match$mid_point - TAD_match$startTAD),abs(Lisa_genes_match$mid_point - TAD_match$endTAD)))
Distance_mid_to_TAD <- apply(Distance_mid_to_TAD,1,min)
median(Distance_mid_to_TAD)
# 218970
Distance_mid_to_TAD_df <- data.frame(Distance_mid_to_TAD)
#arrange by distance to TAD boundary
Clean_up <- cbind(data.frame(Lisa_genes_match), Distance_mid_to_TAD_df, TAD_match)
Clean_up <- Clean_up[c(1:3,7,9:10,12,13)]
Clean_up %>% arrange(Distance_mid_to_TAD)
#### Visualise
boxplot(Distance_mid_to_TAD_df$Distance_mid_to_TAD, Distance_Random_mid_to_TAD_df$Distance_Random_mid_to_TAD, outline=FALSE,
ylab = "Distance from gene mid-point to TAD start/end", xlab = "Observed Random", main = "Distance from Gene to TAD boundary", col = c("#E69F00", "#56B4E9"))
###
# BCL2 Originally sits within a nest TAD so its TAD gets larger and therefore its distance to TAD boundary does too
# Of note, CEBPG and CEBPA share a TAD
# using nB TADs and B_Cell partner genes #
##########################################
# median(Distance_Random_mid_to_TAD)
#[1] 143,579.8
# median(Distance_mid_to_TAD)
#[1] 106,991.5
# Reduced ranges from 'Real_random' #
##########################################
# median(Distance_Random_mid_to_TAD)
#[1] 94,791
# median(Distance_mid_to_TAD)
#[1] 74,400.5