#!/usr/bin/perl -w # # This code was developed by Adam Sadovsky and Gal Chechik # (C) 2005-2008 # It can be used for non-commercial purposes. # use strict; use warnings; use constant S_MODE => 0; # strong mode use constant W_MODE => 1; # weak mode use constant CMPD => "_cmpd"; # init vars my $sva = 5; my $svd = 2; my $saa = 2; my $sad = 5; my $spa = 2; my $spd = 2; my $st = 9; my $stc = 5; my $sc = 6; my $wva = 5; my $wvd = 2; my $waa = 2; my $wad = 5; my $wpa = 2; my $wpd = 2; my $wt = 4; my $wtc = 3; my $wc = 0; my $wyd = 2; my $wha = 2; my $wyt = 1; # tail length my $wht = 1; # tail length my $vand = 1; my $vnec = 1; my $aand = 1; my $anec = 1; # binary my $data_in = "reactions.tab"; my $out_dir = "motifs"; my $out_ext = ".fm2"; my $out_sep = ""; my $out_delim = "\\t"; my $clean_mode = 1; my $verbose = 2; my $allow_intersect = 0; my $filter_orf = 1; my $prog = "findmotifs"; my $config = "config.dat"; my $s_motifs = "motifs_s.dat"; my $w_motifs = "motifs_w.dat"; my $tmp_ext = ".tmp"; my $aux_ext = ".aux"; # aux files my $data_orf = "data_orf$aux_ext"; my $gene_map = "gene_map$aux_ext"; my $gene_ids = "gene_ids$aux_ext"; my $nodes_lhs = "nodes_lhs$aux_ext"; my $nodes_rhs = "nodes_rhs$aux_ext"; my $genes = "genes$aux_ext"; my $reaction_map = "reaction_map$aux_ext"; my $data_out_s = "data_out_s$aux_ext"; my $node_map_s = "node_map_s$aux_ext"; my $node_ids_s = "node_ids_s$aux_ext"; my $data_out_w = "data_out_w$aux_ext"; my $node_map_w = "node_map_w$aux_ext"; my $node_ids_w = "node_ids_w$aux_ext"; my $cmd; my $rc; # get args while(@ARGV) { my $arg = shift @ARGV; if ($arg eq '--help') { print STDOUT ; exit(0); } elsif ($arg eq '-sva') { $sva = shift @ARGV; } elsif ($arg eq '-svd') { $svd = shift @ARGV; } elsif ($arg eq '-saa') { $saa = shift @ARGV; } elsif ($arg eq '-sad') { $sad = shift @ARGV; } elsif ($arg eq '-spa') { $spa = shift @ARGV; } elsif ($arg eq '-spd') { $spd = shift @ARGV; } elsif ($arg eq '-st' ) { $st = shift @ARGV; } elsif ($arg eq '-stc') { $stc = shift @ARGV; } elsif ($arg eq '-sc' ) { $sc = shift @ARGV; } elsif ($arg eq '-wva') { $wva = shift @ARGV; } elsif ($arg eq '-wvd') { $wvd = shift @ARGV; } elsif ($arg eq '-waa') { $waa = shift @ARGV; } elsif ($arg eq '-wad') { $wad = shift @ARGV; } elsif ($arg eq '-wpa') { $wpa = shift @ARGV; } elsif ($arg eq '-wpd') { $wpd = shift @ARGV; } elsif ($arg eq '-wt' ) { $wt = shift @ARGV; } elsif ($arg eq '-wtc') { $wtc = shift @ARGV; } elsif ($arg eq '-wc' ) { $wc = shift @ARGV; } elsif ($arg eq '-wyd') { $wyd = shift @ARGV; } elsif ($arg eq '-wha') { $wha = shift @ARGV; } elsif ($arg eq '-wyt') { $wyt = shift @ARGV; } elsif ($arg eq '-wht') { $wht = shift @ARGV; } elsif ($arg eq '-vand') { $vand = shift @ARGV; } elsif ($arg eq '-vnec') { $vnec = shift @ARGV; } elsif ($arg eq '-aand') { $aand = shift @ARGV; } elsif ($arg eq '-anec') { $anec = shift @ARGV; } elsif ($arg eq '-f' ) { $data_in = shift @ARGV; } elsif ($arg eq '-odir') { $out_dir = shift @ARGV; } elsif ($arg eq '-oe' ) { $out_ext = shift @ARGV; } elsif ($arg eq '-os' ) { $out_sep = shift @ARGV; } elsif ($arg eq '-ode' ) { $out_delim = shift @ARGV; } elsif ($arg eq '-v' ) { $verbose = shift @ARGV; } elsif ($arg eq '-i' ) { $allow_intersect = shift @ARGV; } elsif ($arg eq '-c' ) { $clean_mode = shift @ARGV; } elsif ($arg eq '-orf' ) { $filter_orf = shift @ARGV; } else { die("** Error: findmotifs.pl received bad argument $arg (use --help for help) **\n"); } } # check args if ($out_sep =~ /\./) { die("** Error: findmotifs.pl out_separator must not contain periods **\n"); } if ($out_dir !~ /\/$/) { $out_dir .= "/"; } # filter orf if ($filter_orf) { if ($verbose == 1) { print "."; } elsif ($verbose >= 2) { print "findmotifs.pl: filter orf ($data_in => $data_orf)\n"; } open(DATA_IN, "$data_in") or die("** Error: findmotifs.pl couldn't open file $data_in **\n"); open(DATA_ORF, "> $data_orf") or die("** Error: findmotifs.pl couldn't open file $data_orf **\n"); my $reaction; my $n_orfs = 0; while ($reaction = ) { my $gene = substr($reaction, 0, index($reaction, "\t")); #if (isOrf($gene)) { ### Removed by Gal Oct 2007, to allow reactions-motifs print DATA_ORF $reaction; $n_orfs++; #} } close DATA_IN; close DATA_ORF; if ($n_orfs < 1) { die("** ERROR: findmotifs.pl: No reactions left after filtering for ORFs **\n"); } } # prepare data if ($verbose == 1) { print "."; } elsif ($verbose >= 2) { print "findmotifs.pl: prepare data\n"; } if ($filter_orf) { $data_in = $data_orf; } $cmd = "cut -f1 $data_in > $genes;"; $cmd .= "nl -v0 < $genes | gawk '{print \$2\"\\t\"\$1}' > $gene_map;"; $cmd .= "cut -f2 $gene_map > $gene_ids;"; $cmd .= "cut -f2 $data_in > $nodes_lhs;"; $cmd .= "cut -f3 $data_in > $nodes_rhs;"; $cmd .= "cat $nodes_lhs $nodes_rhs | sort | uniq | nl -v0 | gawk '{print \$2\"\\t\"\$1}' > $node_map_s;"; $cmd .= "cut -f2,3 $data_in | translate_byhash.pl -f $node_map_s > $node_ids_s;"; $cmd .= "paste $gene_ids $node_ids_s > $data_out_s;"; $cmd .= "cat $nodes_lhs $nodes_rhs | sed 's/+/\\n/g' | sort | uniq | nl -v0 | gawk '{print \$2\"\\t\"\$1}' > $node_map_w;"; $cmd .= "cut -f2,3 $data_in | sed 's/+/\\t+\\t/g' | translate_byhash.pl -f $node_map_w | sed 's/\\t+\\t/+/g' > $node_ids_w;"; $cmd .= "paste $gene_ids $node_ids_w > $data_out_w;"; # $cmd .= "paste $genes $nodes_lhs $nodes_rhs $gene_ids | gawk '{print \$1\"_\"\$2\">\"\$3\"\\t\"\$4}' > $reaction_map;"; $cmd .= "paste $genes $gene_ids | gawk '{print \$1\"\\t\"\$2}' > $reaction_map;"; if ($verbose > 3) { print STDERR "cmd=$cmd\n"; } $rc = system $cmd; if ($rc > 0) { die("** Error: findmotifs.pl failed to prepare data **\n"); } # ======================================== # strong mode # ======================================== # make config file (s mode) if ($verbose == 1) { print "."; } elsif ($verbose >= 2) { print "findmotifs.pl: make config file $config (s mode)\n"; } open(CONFIG_FILE, "> $config") or die("** Error: findmotifs.pl couldn't open file $config **\n"); print CONFIG_FILE "graph_mode = ".S_MODE."\n\n"; print CONFIG_FILE "v_anc_threshold = $sva\n"; print CONFIG_FILE "v_des_threshold = $svd\n"; print CONFIG_FILE "a_anc_threshold = $saa\n"; print CONFIG_FILE "a_des_threshold = $sad\n"; print CONFIG_FILE "p_anc_threshold = $spa\n"; print CONFIG_FILE "p_des_threshold = $spd\n"; print CONFIG_FILE "t_threshold = $st\n"; print CONFIG_FILE "tchn_threshold = $stc\n"; print CONFIG_FILE "c_threshold = $sc\n\n"; print CONFIG_FILE "out_directory = $out_dir\n"; print CONFIG_FILE "out_extension = $out_ext\n"; print CONFIG_FILE "out_separator = $out_sep\n"; print CONFIG_FILE "out_delimiter = $out_delim\n\n"; print CONFIG_FILE "data_file = $data_out_s\n"; print CONFIG_FILE "tmp_extension = $tmp_ext\n"; print CONFIG_FILE "verbose = $verbose\n"; print CONFIG_FILE "allow_intersect = $allow_intersect\n"; close(CONFIG_FILE); runProgAndParse(S_MODE); # ======================================== # weak mode # ======================================== # make config file (w mode) if ($verbose == 1) { print "."; } elsif ($verbose >= 2) { print "findmotifs.pl: make config file $config (w mode)\n"; } open(CONFIG_FILE, "> $config") or die("** Error: findmotifs.pl couldn't open file $config **\n"); print CONFIG_FILE "graph_mode = ".W_MODE."\n\n"; print CONFIG_FILE "v_anc_threshold = $wva\n"; print CONFIG_FILE "v_des_threshold = $wvd\n"; print CONFIG_FILE "a_anc_threshold = $waa\n"; print CONFIG_FILE "a_des_threshold = $wad\n"; print CONFIG_FILE "p_anc_threshold = $wpa\n"; print CONFIG_FILE "p_des_threshold = $wpd\n"; print CONFIG_FILE "t_threshold = $wt\n"; print CONFIG_FILE "tchn_threshold = $wtc\n"; print CONFIG_FILE "c_threshold = $wc\n"; print CONFIG_FILE "y_des_threshold = $wyd\n"; print CONFIG_FILE "h_anc_threshold = $wha\n"; print CONFIG_FILE "y_tail_threshold = $wyt\n"; print CONFIG_FILE "h_tail_threshold = $wht\n\n"; print CONFIG_FILE "allow_vand = $vand\n"; print CONFIG_FILE "allow_vnec = $vnec\n"; print CONFIG_FILE "allow_aand = $aand\n"; print CONFIG_FILE "allow_anec = $anec\n\n"; print CONFIG_FILE "out_directory = $out_dir\n"; print CONFIG_FILE "out_extension = $out_ext\n"; print CONFIG_FILE "out_separator = $out_sep\n"; print CONFIG_FILE "out_delimiter = $out_delim\n\n"; print CONFIG_FILE "data_file = $data_out_w\n"; print CONFIG_FILE "tmp_extension = $tmp_ext\n"; print CONFIG_FILE "verbose = $verbose\n"; print CONFIG_FILE "allow_intersect = $allow_intersect\n"; close(CONFIG_FILE); runProgAndParse(W_MODE); # ======================================== # clean aux # ======================================== if ($clean_mode == 1 || $clean_mode == 3) { if ($verbose == 1) { print ".\n"; } elsif ($verbose >= 2) { print "findmotifs.pl: clean aux\n"; } $rc = system "rm -f $config $data_orf $data_out_s $data_out_w $gene_map $node_map_s $node_map_w" . " $gene_ids $nodes_lhs $nodes_rhs $node_ids_s $node_ids_w $reaction_map $genes"; if ($rc > 0) { die "** Error: findmotifs.pl failed to clean aux **\n" } } # ======================================== # runProgAndParse($run_mode) # ======================================== sub runProgAndParse { my $run_mode = $_[0]; print "run_mode = $run_mode\n"; my $mode_name = ($run_mode == S_MODE) ? 's' : 'w'; my $motifs_fname = ($run_mode == S_MODE) ? $s_motifs : $w_motifs; my $node_map = ($run_mode == S_MODE) ? $node_map_s : $node_map_w; print "motifs_fname = $motifs_fname"; # run $prog if ($verbose == 1) { print "."; } elsif ($verbose >= 2) { print "findmotifs.pl: run $prog ($mode_name mode)\n"; } $rc = system "$prog"; if ($rc > 0) { die("** Error: findmotifs.pl failed to run $prog **\n"); } # parse $prog output if ($verbose == 1) { print "."; } elsif ($verbose >= 2) { print "findmotifs.pl: process $prog output ($mode_name mode)\n"; } open(MOTIFS_FILE, "$out_dir$motifs_fname") or die("** Error: findmotifs.pl couldn't open file $out_dir$motifs_fname **\n"); my $file; while ($file = ) { chomp($file); if ($verbose > 2) { print STDERR "$file\n"; } if ($file =~ /@{[CMPD]}$/) { my $num_cols; if ($file =~ /^[aAvV]/) { $num_cols = 3; } elsif ($file =~ /^[tT]chn(\d+)@{[CMPD]}$/) { $num_cols = $1; $num_cols = $num_cols * 2 - 1; } $cmd = "cat $out_dir$file$tmp_ext"; for (my $i = 2; $i <= $num_cols; $i += 2) { $cmd .= " | translate_byhash.pl -b -f $node_map -n $i"; } for (my $i = 1; $i <= $num_cols; $i += 2) { $cmd .= " | translate_byhash.pl -b -f $reaction_map -n $i"; } $cmd .= " | sort | uniq > $out_dir$file$out_ext"; if ($verbose > 3) { print STDERR "cmd=$cmd\n"; } $rc = system $cmd; if ($rc > 0) { die "** Error: findmotifs.pl failed to process $prog output **\n" } } else { $cmd = "translate_byhash.pl -b -f $reaction_map"; $cmd .= " < $out_dir$file$tmp_ext | sort | uniq > $out_dir$file$out_ext"; if ($verbose > 3) { print STDERR "cmd=$cmd\n"; } $rc = system $cmd; if ($rc > 0) { die "** Error: findmotifs.pl failed to process $prog output **\n" } } if ($clean_mode == 1 || $clean_mode == 2) { $rc = system "rm -f $out_dir$file$tmp_ext"; if ($rc > 0) { die "** Error: findmotifs.pl failed to clean tmp **\n" } } } close MOTIFS_FILE; } # ======================================== # $bool = isOrf($string) # ======================================== sub isOrf { my $s = $_[0]; if ($s =~ /^(Y[A-P][LR]\d{3}[CW](-[A-Z])?)$/i) { return 1; } return 0; } __DATA__ findmotifs.pl ------------- SYNTAX: findmotifs.pl [OPTIONS] OPTIONS: Option Description Default ------------------------------------------------------------- -f name of data file reactions.tab -odir output file directory motifs -oe output file extension .fm2 -os output file separator [none] -ode output file delimiter \\t -c clean_mode: remove none = 0 1 remove all = 1 remove tmp = 2 remove aux = 3 -orf use orf filter (0-1) 1 -v verbosity level (0-4) 2 -i allow intersections in motifs (0-1) 0 -sva strong V motif ancestor threshold 5 -svd strong V motif descendant max dist 2 -saa strong A motif ancestor max dist 2 -sad strong A motif descendant threshold 5 -spa strong P motif ancestor max dist 2 -spd strong P motif descendant max dist 2 -st strong T motif max dist 9 -stc strong T motif max chain length (2+) 5 -sc strong C motif max length 6 -wva weak v motif ancestor threshold 5 -wvd weak v motif descendant max dist 2 -waa weak a motif ancestor max dist 2 -wad weak a motif descendant threshold 5 -wpa weak p motif ancestor max dist 2 -wpd weak p motif descendant max dist 2 -wt weak t motif max dist 5 -wtc weak t motif max chain length (2+) 3 -wc weak c motif max length 0 -wyd weak y motif descendant max dist 2 -wha weak h motif ancestor max dist 2 -wyt weak y motif max tail length 1 -wht weak h motif max tail (head) length 1 -vand allow search for vand motifs (0-1) 1 -vnec allow search for vnec motifs (0-1) 1 -aand allow search for aand motifs (0-1) 1 -anec allow search for anec motifs (0-1) 1 EXAMPLES: findmotifs.pl -f reaction_list.tab outputs will be saved in a subdirectory named 'motifs' findmotifs.pl -f reaction_list.tab -sva 4 -svd 1 -st 0 -oe .dat -ode \\s -odir mymotifs -vnec 0