-
Notifications
You must be signed in to change notification settings - Fork 0
/
ENCODE_H3K27ac_joiner.R
54 lines (44 loc) · 2.3 KB
/
ENCODE_H3K27ac_joiner.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
library(dplyr)
GM12878_ENCODE_features <- read.delim("~/Data/GM12878/ENCODE/GM12878_ENCODE_features.tsv", header = T)
# remove every row with a 0 in every column, these are not informative and will help speed up the joiner
#long winded filter but couldn't get filter(across()) to work so filtered then subset oringinal by rownames
#copy
Original1 <- data.frame(GM12878_ENCODE_features)
Original1 <- data.frame(sapply(Original1[,4:62], as.numeric))
Original1 <- data.frame(GM12878_ENCODE_features[,1:3], Original1)
#add rownames
rownames(Original1) <- paste(Original1$seqnames, Original1$start, sep = "_")
#filter all features if EVERY column is 0
Original1 <- filter_all(Original1[4:62], any_vars(. != 0))
# check using row sums that nothing = 0
sum(rowSums(Original1) == 0 )
# rowSums(Original) useful because it shows that some rows only a 1, meaning that the code is working and only removing rows that have no 1s in.
# need to merge back to get the seqnmaes, start and end
H3K27ac_regions <- data.frame(GM12878_ENCODE_features)
#add rownames
rownames(H3K27ac_regions) <- paste(H3K27ac_regions$seqnames, H3K27ac_regions$start, sep = "_")
#use the rownames from the filtered dataframe to get the rest of the data back
H3K27ac_regions <- H3K27ac_regions[rownames(Original1),]
`
l <- list()
i = 1
while (i < nrow(H3K27ac_regions)) {
if (i %% 5000 == 0) print(paste0("We are at row ", i))
iFirst = i
while (H3K27ac_regions[i, "H3K27ac"] == 1 & H3K27ac_regions[i + 1, "H3K27ac"] == 1 & H3K27ac_regions[i, "end"] == H3K27ac_regions[i + 1, "start"] & H3K27ac_regions[i, "seqnames"] == H3K27ac_regions[i + 1, "seqnames"]) i = i + 1
iLast = i
# now we copy the last row we have checked
newRow = H3K27ac_regions[i,]
# we change the start to match the start in the beginning of the chain
newRow[1, "start"] = H3K27ac_regions[iFirst,"start"]
# we take the maximum for each of the rows that contain 0 or 1
for (j in 4:62) newRow[1, j] = sum(H3K27ac_regions[iFirst:iLast, j])
# adds this row to the end of our resulting dataframe
l[[i]] <- newRow
# increase the idex i to be ready for the next chain
i = i + 1
}
Joined_H3K27ac_GM12878 <- data.table::rbindlist(l)
rm(l)
readr::write_tsv(x = Joined_H3K27ac_GM12878, file = "~/Data/GM12878/ENCODE/Joined_Genomes/Joined_H3K27ac_GM12878.tsv", col_names = T)
save.image()