From:	SMTP%"clark@mshri.utoronto.ca" 12-MAR-1990 17:36
To:	GILBERTD
Subj:	FAMAIL shell update

Date: Mon, 12 Mar 90 17:28:19 EST
Message-Id: <9003122228.AA14079@lash.utcs.utoronto.ca>
From: clark@mshri.utoronto.ca
To: @mail-shells.dis@lash.utcs.utoronto.ca
Subject: FAMAIL shell update

Fellow GCGer,

   Here is the modified version of the DCL shell FAMAIL. It has been updated
to allow searching of the new GenBank database GenPept, which is a translation
of the protein-coding regions of the regular GenBank sequences.

   I have also changed the shell to make it impossible to search a protein
sequence against a DNA database, and vice versa. One of our post-docs tried
searching a peptide against GenBank and got back at least 200 pages
of junk. I hope this modification will cut down the net traffic a bit.

   As always, let me know if you discover any bugs.


Stephen Clark, Ph.D.

Computer Resources Manager
Samuel Lunenfeld Research Institute
Mount Sinai Hospital
600 University Avenue
Toronto, Ontario
Canada  M5G 1X5

clark@mshri.utoronto.ca  (Internet)
sinai@utoroci            (Netnorth/Bitnet)

**************
$	! FAMAIL.COM
$
$	! March 12, 1990
$
$	! Written by Steve Clark
$	! Mt. Sinai Hospital Research Institute, Toronto, Canada
$
$	! Command procedure to send a sequence to GenBank to have a FASTA
$	! search performed on it. THis procedure asks all the relevant 
$	! questions, constructs a text file with the sequence in Intelligenetics
$	! format, and mails it to GenBank. It accepts the name of the query
$	! sequence on the command line as P1.
$	! Note: the symbol SEARCH_ADDRESS is the network address for the Genbank
$	! Search service. This will have to be changed to accomodate local
$	! gateways, etc.
$
$	on control_y then goto terminate
$	bell[0,7] = 7
$	ws := "write sys$output"
$	iq := inquire/nopunctuation
$
$	! The Internet address for sending the search file is
$	! SEARCH@GENBANK.BIO.NET
$
$	search_address := "lash::genbank.bio.net::search"
$
$	ws ""
$	ws "This procedure initiates a FASTA search for similarity between"
$	ws "your query sequence and one of the databases maintained by GenBank."
$	ws "The information required for executing the search is sent to"
$	ws "GenBank via electronic mail and is executed by the GenBank people"
$	ws "themselves. Their databases are much more current than our local"
$	ws "ones, and their computer is very fast. You must specify which"
$	ws "strand to search, since GenBank only searches ONE of the strands"
$	ws "of your sequence. The results of the search will be returned to"
$	ws "you via e-mail."
$	ws ""
$	ws "Please remember that if you submit a second search against ALL of"
$	ws "GenBank or EMBL while a previous search is still waiting to be"
$	ws "executed, the previous search will be aborted."
$
$	seqname := "''p1'"
$	if(seqname.NES."") then goto no_query
$
$get_query:
$
$	ws ""
$	iq seqname "GenBank FASTA with what query sequence? "
$	if(seqname.EQS."") then goto get_query
$
$no_query:
$
$	! See if the sequence exists
$
$	assign/user_mode nl: sys$output
$	seqinfo/infile='seqname'
$	if(seqinfotype.NES."NONE") then goto check_gcg
$	ws ""
$	ws "''bell'''seqname' doesn't exist. Please try again."
$	goto get_query
$
$check_gcg:
$
$	! Check if the sequence is in GCG format.
$
$	if(seqinfotype.NES."NOGCG") then goto get_start
$	ws ""
$	ws "''bell'''seqname' is not a legitimate GCG sequence file!"
$	ws ""
$	ws "Select option by number -"
$	ws ""
$	ws "1) Specify another sequence"
$	ws "2) Quit"
$	ws ""
$	iq choice "Choice (* 1 *) ? "
$	if(choice.EQS."2") then exit
$	goto get_query
$
$get_start:
$
$    	ws ""
$    	iq begin "Begin (* 1 *) ? "
$    	if (begin.EQS."") then begin := 1
$	ibegin = f$integer(begin)
$	if((ibegin.GE.1).AND.(ibegin.LT.f$integer(seqinfolength))) then -
   			goto get_end
$	ws "''bell'The start must be between 1 and ''seqinfolength'."
$	goto get_start
$
$get_end:
$
$    	iq end "End (* ''seqinfolength' *) ? "
$    	if (end.EQS."") then end := 'seqinfolength'
$	iend = f$integer(end)
$	if((iend.GT.ibegin).AND.(iend.LE.f$integer(seqinfolength))) then -
   			goto get_reverse
$	ws "''bell'The start must be between 1 and ''seqinfolength'."
$	goto get_end
$
$get_reverse:
$
$	iq reverse "Reverse (* No *)? "
$	if(reverse.EQS."") then reverse := NO
$	if(f$extract(0,1,reverse).EQS."Y") then reverse := "YES"
$	if(f$extract(0,1,reverse).EQS."N") then reverse := "NO"
$	if(reverse.EQS."YES") then goto get_database
$	if(reverse.EQS."NO") then goto get_database
$	ws "''bell'Please answer Yes or No."
$	goto get_reverse
$
$get_database:
$
$	! Find out which database to search. If the sequence is DNA, the
$	! default is the Genbank database. The default for proteins is
$	! the SWISS-PROT database
$
$	ws ""
$	ws "Database to search:"
$	ws ""
$	if(seqinfotype.EQS."PROTEIN") then goto get_pepdatabase
$	ws " 1) ALL of GenBank"
$	ws " 2) GenBank Primate sequences"
$	ws " 3) GenBank Rodent sequences"
$	ws " 4) Other GenBank Mammalian sequences"
$	ws " 5) Other GenBank Vertebrate sequences"
$	ws " 6) GenBank Invertebrate sequences"
$	ws " 7) GenBank Plant sequences"
$	ws " 8) GenBank Bacterial sequences"
$	ws " 9) GenBank Organelle sequences"
$	ws "10) GenBank Phage sequences"
$	ws "11) GenBank Viral sequences"
$	ws "12) GenBank Structural RNA sequences"
$	ws "13) GenBank Synthetic sequences"
$	ws "14) GenBank Unannotated sequences"
$	ws "15) New GenBank sequences since the last quarterly release"
$	ws "16) ALL of EMBL"
$	ws "17) New EMBL sequences since the last release
$	ws ""
$	iq choice "Please enter choice (* 1 *): "
$	if(choice.EQS."") then choice := 1
$	database := ""
$	if(choice.EQS."1") then database := GENBANK/ALL
$	if(choice.EQS."2") then database := GENBANK/PRIMATE
$	if(choice.EQS."3") then database := GENBANK/RODENT
$	if(choice.EQS."4") then database := GENBANK/OTHER_MAMMALIAN
$	if(choice.EQS."5") then database := GENBANK/OTHER_VERTEBRATE
$	if(choice.EQS."6") then database := GENBANK/INVERTEBRATE
$	if(choice.EQS."7") then database := GENBANK/PLANT
$	if(choice.EQS."8") then database := GENBANK/BACTERIAL
$	if(choice.EQS."9") then database := GENBANK/ORGANELLE
$	if(choice.EQS."10") then database := GENBANK/PHAGE
$	if(choice.EQS."11") then database := GENBANK/VIRAL
$	if(choice.EQS."12") then database := GENBANK/STRUCTURAL_RNA
$	if(choice.EQS."13") then database := GENBANK/SYNTHETIC
$	if(choice.EQS."14") then database := GENBANK/UNANNOTATED
$	if(choice.EQS."15") then database := GENBANK/NEW
$	if(choice.EQS."16") then database := EMBL/ALL
$	if(choice.EQS."17") then database := EMBL/NEW
$	if(database.NES."") then goto get_wordsize
$	ws "''bell'Valid responses are 1 - 17, inclusive."
$	goto get_database
$
$get_pepdatabase:
$
$	ws "1) ALL of SWISS-PROT"
$	ws "2) ALL of the translated GenBank sequences"
$	ws "3) New translated GenBank sequences since last quarterly release"
$	ws ""
$	iq choice "Please enter choice (* 1 *): "
$	if(choice.EQS."") then choice := 1
$	database = ""
$	if(choice.EQS."1") then database := SWISS-PROT/ALL
$	if(choice.EQS."2") then database := GENPEPT/ALL
$	if(choice.EQS."3") then database := GENPEPT/NEW
$	if(database.NES."") then goto get_wordsize
$	ws "''bell'Valid responses are 1 - 3, inclusive."
$	goto get_database
$
$get_wordsize:
$
$	! Find out how long the word should be
$
$	if(seqinfotype.EQS."PROTEIN") then goto get_pwordsize
$
$	ws ""
$	iq wordsize "What word size (* 4 *)? "
$	if(wordsize.EQS."") then wordsize := 4
$	if((f$integer(wordsize).GT.2).AND.(f$integer(wordsize).LT.7)) -
			then goto get_nscores
$	ws "''bell'Word size must be in the range from 3 to 6."
$	goto get_wordsize
$
$get_pwordsize:
$
$	ws ""
$	iq wordsize "What word size (* 1 *)? "
$	if(wordsize.EQS."") then wordsize := 1
$	if((f$integer(wordsize).GT.0).AND.(f$integer(wordsize).LT.3)) -
			then goto get_nscores
$	ws "''bell'Word size must be in the range from 1 to 2."
$	goto get_pwordsize
$
$get_nscores:
$
$	ws ""
$	iq nscores "List how many best scores (* 100 *)? "
$	if(nscores.EQS."") then nscores := 100
$	if(f$integer(nscores).GE.10) then goto get_nalign
$	ws "''bell'At least 10 scores should be listed."
$	goto get_nscores
$
$get_nalign:
$
$	ws ""
$	iq nalign "Align how many matches (* 20 *)? "
$	if(nalign.EQS."") then nalign := 20
$	if(nalign.GT.4) then goto summarize
$	ws "''bell'At least 5 alignments should be shown."
$	goto get_nalign
$
$summarize:
$
$	ws ""
$	ws ""
$	ws "The following FASTA search will be executed:"
$	ws ""
$	if(reverse.EQS."NO") then ws "Query sequence: ''seqname' from ", -
   			"''begin' to ''end' (''seqinfotype')"
$	if(reverse.EQS."YES") then ws "Query sequence: Reverse of ''seqname'", -
   			" from ''begin' to ''end' (''seqinfotype')"
$	ws "Database to be searched: ''database'"
$	ws "Word size: ''wordsize'"
$	ws "Scores to list: ''nscores'"
$	ws "Alignments to show: ''nalign'"
$	ws ""
$	iq choice "Are these parameters correct (* Yes *)? "
$	choice = f$extract(0, 1, choice)
$	if(choice.EQS."") then goto do_it
$	if(choice.EQS."Y") then goto do_it
$
$	! Something is wrong. Give the chance to correct it, or give up.
$
$ask_repeat:
$
$	ws ""
$	ws "Do you want to"
$	ws ""
$	ws "1) Try again"
$	ws "2) Give up"
$	ws ""
$	iq choice "Please enter the number of your choice (* 1 *): "
$	if(choice.eqs."") then goto get_query
$	if(choice.eqs."1") then goto get_query
$	if(choice.eqs."2") then exit
$	ws "''bell'Wasn't the question simple enough for you?"
$	goto ask_repeat
$
$do_it:
$
$	! Determine the root name of the sequence for comparison. This in used
$	! in specifying the output file name, which has the extension .FAM.
$
$	! Check if the sequence is a file. If not, assume it is a database
$	! entry and get the locus name to use as a root.
$
$	seqroot = seqname
$	root = f$parse(seqroot,,,"NAME") ! root name of sequence
$	if(root.NES."") then goto set_outname
$
$	! Remove database name
$
$	pos = 'f$locate(":", seqroot)'
$	len = 'f$length(seqroot)'
$	if(pos.NE.len) then root = f$extract(pos+1, len-pos, seqroot)
$
$ set_outname:
$
$	outname := "''root'.FAM"
$
$	! Convert the sequence to Intelligenetics format. Since ToIG does not
$	! allow command line input, need to construct a little command 
$	! procedure to substitute in the appropriate sequence names.
$
$	ws ""
$	ws "Converting ''seqname' to IntelliGenetics format..."
$	ws ""
$
$	open/write comfile fam.cmd
$	wc := "write comfile"
$	wc "$ set noon"
$	wc "$ set verify"
$	wc "ToIG"
$	wc "''seqname'"
$	wc "''begin'"
$	wc "''end'"
$	wc "''reverse'"	! Reverse?
$	wc "''root'.IG"	! Output file name
$	wc "$ set noverify"
$	close comfile
$	@fam.cmd
$	del fam.cmd;0
$
$	! Write the text file that will be mailed to GenBank
$
$	ws "Creating the file to be mailed to GenBank..."
$
$	open/write comfile 'outname'
$	wc "DATALIB ''database'"
$	wc "KTUP ''wordsize'"
$	wc "SCORES ''nscores'"
$	wc "ALIGNMENTS ''nalign'"
$	wc "BEGIN"
$
$	! The IG format files that GCG makes starts with a blank line, which
$	! chokes the GenBank FASTA program. Therefore it is not possible to
$	! just append the sequence file to the text file. Read it line by
$	! line, copying over only those records that aren't zero length. (At
$	! the present time there is just one blank line - the first one - but
$	! who knows what the future has in store for us?)
$
$	open/read igfile 'root'.IG
$
$read_loop:
$
$		read/end_of_file=eof igfile record
$		if(f$length(record).EQ.0) then goto read_loop
$		write comfile record
$
$	goto read_loop
$
$eof:
$
$	close igfile
$	close comfile
$	delete 'root'.IG;0
$
$	! Mail the file away.
$
$	ws "Mailing the file to GenBank..."
$
$	mail 'outname' 'search_address'
$
$	ws "The file ''outname' has been sent to GenBank."
$	ws "The results will be mailed back to you shortly."
$	ws ""
$
$	delete 'outname';0
$
$ terminate:  ! Jump here on ^y. Don't do any cleanup
$
$	exit
