File: add_annotations_to_GO_and_lengths_file.R

package info (click to toggle)
trinityrnaseq 2.15.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 468,004 kB
  • sloc: perl: 49,905; cpp: 17,993; java: 12,489; python: 3,282; sh: 1,989; ansic: 985; makefile: 717; xml: 62
file content (54 lines) | stat: -rw-r--r-- 2,729 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54


# contributed by: Satyajeet Khare  satyajeetkhare@gmail.com
# May 3, 2018

# If you modified your gene count matrix or transcript count matrix to add functional annotations using the
# Trinity/Analysis/DifferentialExpression/rename_matrix_feature_identifiers.pl script, you will get an error
# when you run the Trinity/Analysis/DifferentialExpression/analyze_diff_expr.pl script for Gene Ontology analysis.
# The error will look like "Error in gene_lengths[features_with_GO, ] : subscript out of bounds".
# You will get this error because of a discrepancy between the Trinity gene IDs in the "go_annotations" file and
# the annotated gene IDs in the "gene.matrix" file in the edger or deseq output folder.
# To correct this error, modify the go_annotations file and the gene_lengths file using the script below.

# Switch to the directory that holds the three input files.
# The path below is a placeholder: replace it with your own before running.
setwd("Your_working_directory/")

# Gene lengths table: tab-delimited, first line is treated as the header
Trinity_gene_lengths <- read.delim("Trinity.gene_lengths.txt")

# GO annotations table: no header line, columns are auto-named V1, V2, ...
# NOTE(review): sep = NULL is not a documented separator value; presumably it
# behaves like whitespace splitting - confirm against your R version.
go_annotations <- read.table(
  "go_annotations.txt",
  header = FALSE,
  sep = NULL,
  quote = "\"",
  dec = ".",
  fill = TRUE,
  comment.char = ""
)

# Feature-name map: tab-delimited, no header line (columns V1, V2, ...)
annot_feature_map <- read.delim("annot_feature_map.txt", header = FALSE)

# Build the modified gene-length table: every gene ID in the "X.gene_id" column
# that has an entry in annot_feature_map (V1 = original ID, V2 = annotated ID)
# is replaced by its annotated ID; IDs without an entry are kept unchanged.
#
# The lookup uses base R match() so that this step does not depend on the
# dplyr / data.table packages, which this script never loads (the previous
# left_join() / setDT() calls failed with "could not find function").
# If annot_feature_map lists an ID more than once, the first entry is used,
# so the output always has exactly one row per input row.
trinity_ids <- as.character(Trinity_gene_lengths$X.gene_id)
map_row <- match(trinity_ids, as.character(annot_feature_map$V1))
annotated_ids <- as.character(annot_feature_map$V2)[map_row]

# Fall back to the original gene ID wherever no annotation was found
not_mapped <- is.na(annotated_ids)
annotated_ids[not_mapped] <- trinity_ids[not_mapped]

# Replace the "X.gene_id" values with the resolved IDs
Trinity_gene_lengths_mod <- Trinity_gene_lengths
Trinity_gene_lengths_mod$X.gene_id <- annotated_ids

# Write the modified gene length file (tab-delimited, with header line).
# This file will be used for analysis of differential expression.
write.table(
  Trinity_gene_lengths_mod,
  file = "Trinity_gene_lengths_mod.txt",
  quote = FALSE,
  row.names = FALSE,
  sep = "\t"
)

# Build the modified GO annotation table: every ID in column V1 that has an
# entry in annot_feature_map (V1 = original ID, V2 = annotated ID) is replaced
# by its annotated ID; IDs without an entry are kept unchanged, mirroring the
# fallback used for the gene-length table.
#
# The lookup uses base R match() instead of a join. A join of these two tables
# renames the clashing "V2" columns to "V2.x" / "V2.y", so the old
# `go_annotations_mod$V2 <- NULL` removed nothing and the helper column ended
# up in the output file. match() adds no helper column in the first place and
# needs no package that this script does not load.
go_ids <- as.character(go_annotations$V1)
go_map_row <- match(go_ids, as.character(annot_feature_map$V1))
go_annotated_ids <- as.character(annot_feature_map$V2)[go_map_row]

# Fall back to the original ID wherever no annotation was found, so that no
# row ends up with NA as its identifier
go_not_mapped <- is.na(go_annotated_ids)
go_annotated_ids[go_not_mapped] <- go_ids[go_not_mapped]

# Replace the "V1" values with the resolved IDs; all other columns are kept
go_annotations_mod <- go_annotations
go_annotations_mod$V1 <- go_annotated_ids

# Write the modified GO annotation file. The input file is read without a
# header line, so the replacement is written without one as well
# (col.names = FALSE); otherwise "V1 V2 ..." would appear as a bogus first
# record. This file will be used for analysis of differential expression.
write.table(
  go_annotations_mod,
  file = "go_annotations_mod.txt",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE,
  sep = "\t"
)

# Use "Trinity_gene_lengths_mod.txt" in place of "Trinity.gene_lengths.txt"
# and "go_annotations_mod.txt" in place of "go_annotations.txt"
# in the "Trinity/Analysis/DifferentialExpression/analyze_diff_expr.pl" script.