Microsatellite (repeat) searching

Mary Pat Reeve mpreeve at genome.wi.mit.edu
Thu Mar 10 14:06:23 EST 1994

Here is a basic piece of C code that will find all permutations of
microsatellite repeats.  The basic idea is that it moves a window over
the sequence and checks if the next window agrees with the previous
one.  REPEAT_THRESHOLD is the number of windows that must be the same
before the repeat is marked; i.e., CACACACACACACACACACA would qualify
as 10.  Once the repeat is marked, that region is marked so that it is
not searched for larger base unit repeats.  MAX_WINDOW is the max number
of units in the base repeat.  CA=2, GAC=3, etc.  The code starts at 2
and works up to MAX_WINDOW.  This algorithm also takes into account the
fact that microsatellites can be imperfect (random extra bases thrown in)
and can be compound.  Let me know if you have any questions.

/* Copyright Whitehead Institute for Biomedical Research.  All rights
reserved
*/
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>

#define FALSE 0
#define TRUE  1

#define MAX_IMPERFECT_GAP 4    /* The default maximum number of imperfect
* bases allowed in a microsatellite */
#define NUM_MS            100   /* Maximum number of microsatellites in a seq */
#define MAX_WINDOW        6   /* The maximum base repeat unit length */
#define REPEAT_THRESHOLD  10

#define MAX_LINE 500000
#define MARGIN 250

#define MAX_SUBSEQ    500000

typedef struct {
int length;        /* Length of the microsatellite feature */
int start;         /* Start of the feature in the whole sequence */
char type[MAX_WINDOW+1];     /* Base repeat unit -- CA, TCA, GA, etc */
} REPEAT_INFO;

extern REPEAT_INFO *repeats;
int target_start, target_end;
int find_microsatellite();
void find_best_target();

REPEAT_INFO *repeats;
main(argc, argv)
int argc;
char **argv;
{
int       length, current_num, genbank_start = 0, i, start, end, num_ms;
int       start_o_target, length_o_target;
char      type_o_target[MAX_WINDOW+1];
char      seq[MAX_LINE], name, match_seq, new_seq[MAX_LINE];
char      description, origin, journal;
FILE      *ifp;

repeats = (REPEAT_INFO *) malloc ((NUM_MS+1) * sizeof(REPEAT_INFO));

/* Check that the correct arguments are given */
if (argc < 3) {
printf("Usage: %s seq_file start_num\n", argv);
exit(1);}

/* Open the files and set the target sequence*/
if((ifp = fopen(argv, "r")) == NULL){
printf("*** error: can't open %s\n", argv); exit(1);}

current_num = atoi(argv);

printf("1 set_organism %% |mus| %%\n");

while(fgets(name, MAX_LINE, ifp) != NULL){
name[strlen(name)-1] = '\0';
fgets(description, MAX_LINE, ifp);
description[strlen(description)-1] = '\0';
fgets(origin, MAX_LINE, ifp);
origin[strlen(origin)-1] = '\0';
if(strcmp(origin, " ") == 0) strcpy(origin, "");
fgets(journal, MAX_LINE, ifp);
journal[strlen(journal)-1] = '\0';
fgets(seq, MAX_LINE, ifp);
seq[strlen(seq)-1] = '\0';
length = (int)strlen(seq);
num_ms = find_microsatellite(seq, length, MAX_WINDOW);
if(num_ms == 1){
start = repeats.start;
end = start + repeats.length -1;
start_o_target = start;
length_o_target = end - start + 1;
strcpy(type_o_target,repeats.type);
if((start - MARGIN) <= 0)
start = 0;
else(start -= MARGIN);
genbank_start = start;
start_o_target -= start;
if((end + MARGIN) > length)
end = length;
else end += MARGIN;
for(i = 0; i< (end-start+1); i++){
new_seq[i] = seq[start+i];}
new_seq[i] ='\0';
strcpy(seq, new_seq);

}
if(num_ms > 1){
find_best_target(num_ms);
start = target_start;
end = target_end;
start_o_target = start;
length_o_target = end - start + 1;
strcpy(type_o_target,repeats.type); /* Not really the correct target
type in this case */
if((start - MARGIN) <= 0)
start = 0;
else(start -= MARGIN);
genbank_start = start;
start_o_target -= start;
if((end + MARGIN) > length)
end = length;
else end += MARGIN;
for(i = 0; i< (end-start+1); i++){
new_seq[i] = seq[start+i];}
new_seq[i] ='\0';
strcpy(seq, new_seq);
}

if(num_ms > 0){
/* Print out the microsatellite with certain margins around it */
printf("1 put_marker %% |D%d| %%\n", current_num);
printf("2 put_marker_name %% |D%d| |%s| %%\n", current_num, name);
printf("3 put_genbank_info %% |D%d| || |%d| |%s| |%s| |%s| %%\n",
current_num, genbank_start, origin, description, name);
printf("4 put_sequence %% |D%d| |%s| || |GenBank| |%s| %%\n",
current_num, seq, journal);
printf("5 put_insert %% |D%d| |0| |%d| %%\n",
current_num, (int)strlen(seq));
printf("6 put_target %% |D%d| |%d| |%d| |%s| %%\n",
current_num, start_o_target, length_o_target, type_o_target);
printf("5 put_FASTN %% |D%d| |0| || || %%\n", current_num);
printf("6 put_BLAST %% |D%d| |0| || || %%\n", current_num);
current_num++;
fflush(stdout);
}

}
fclose(ifp);
}

int find_microsatellite(seq, length, max_window)
char *seq;
int length, max_window;
{

int window_length = 2, unit_matches, gap, rep_count=0  , i=0, start, k, j;
char test_seq[MAX_SUBSEQ], repeat_type[MAX_WINDOW], double_repeat[MAX_WINDOW*2], gap_sequence[2*MAX_IMPERFECT_GAP+1];
char *following_window, *leading_window, *next2, *p_start;

/* Initialize the repeat structure */
for(j = 0; j<NUM_MS; j++){
repeats[j].start = 0;
repeats[j].length = 0;
strcpy(repeats[j].type, "");}

/* Read in the target string to see if we are looking for
* a number of a sequence of particular repeats
* Check to make sure the window_length >= 2 <=MAX_WINDOW
* Get max_window either from the specific repeat or
* the number in parentheses
*/

strncpy(test_seq,seq,length);  /* Make a copy to use */

while(window_length <= max_window){ /* Loop over the sequence once for each window size */
if(length > (window_length*2)){

/* Initialize all the pointers for the next pass over the sequence */
following_window = test_seq;    /* The window following behind is set to start
at the beginning of the sequence */
leading_window = following_window + window_length;
/* The leading window is just ahead */
strncpy(repeat_type, following_window, window_length);
repeat_type[window_length] = '\0';
unit_matches = 0;
start = 0;
i = 0;

while(*(leading_window + window_length -1) != 0){     /* Loop over the sequence */

/* Check to see if we already found a repeat here */
if(*following_window == 'X'){
following_window += window_length;
leading_window += window_length ;
i += window_length;
strncpy(repeat_type, following_window, window_length);
repeat_type[window_length] = '\0';}
else {
/* Check to see if the two windows match -- */
if(unit_matches == 0){  /* we're possibly looking at a new repeat */
start = i;
strncpy(repeat_type, following_window, window_length);
repeat_type[window_length] = '\0';
}
unit_matches++;

/* we'd rather have a repeat_type without an N */
if(strchr(repeat_type, 'N') != NULL){
strncpy(repeat_type, following_window, window_length);
repeat_type[window_length] = '\0';}
if(strchr(repeat_type, 'N') != NULL){
repeat_type[window_length] = '\0';}

following_window = leading_window = test_seq;
following_window += i + window_length;  /* Advance the windows */
leading_window += i + (window_length*2);
i += window_length;

}
else{

/* if we have a decent start look ahead to see if
there's more after a short gap */
strcpy(double_repeat, repeat_type);
strcat(double_repeat, repeat_type);
strncpy(gap_sequence, (test_seq + i + window_length), (2*MAX_IMPERFECT_GAP));
gap_sequence[strlen(gap_sequence)-1] = '\0';
next2 = strstr(gap_sequence, double_repeat);
if((next2 != 0 ) && (unit_matches >=1)) {
next2 = strstr((test_seq + i + window_length), double_repeat);
following_window = next2;
i += (next2 - leading_window);
leading_window =  following_window + window_length;
unit_matches++;
}

else{
/* no match, move ahead by one */
if(unit_matches >= (REPEAT_THRESHOLD-1)){
repeats[rep_count].start = start;
p_start = test_seq + start;
repeats[rep_count].length =  leading_window - p_start;
strcpy(repeats[rep_count].type,repeat_type);

/* Mark the sequence as having a repeat so we don't go over it again */
for(k = 0; k < (repeats[rep_count].length); k++)
test_seq[(repeats[rep_count].start) + k] = 'X';

rep_count++;}
unit_matches = 0;
following_window++;
i++;
start = i;
strncpy(repeat_type, following_window, window_length);
/* Null terminate the string */
repeat_type[window_length] = '\0';
}}}}

/* Deal with a repeat that goes right to the end */
if(unit_matches >= (REPEAT_THRESHOLD-1)){
repeats[rep_count].start = start;
p_start = test_seq + start;
repeats[rep_count].length =  leading_window - p_start;
strcpy(repeats[rep_count].type,repeat_type);

/* Mark the sequence as having a repeat so we don't go over it again */
for(k = 0; k < (repeats[rep_count].length); k++)
test_seq[(repeats[rep_count].start) + k] = 'X';

rep_count++;}

/* Move up the window size */
window_length++;
}
else { window_length++; }

}
return(rep_count);

}

int ms_match(repeat_type, leading_seq, window_length)
int window_length;
{

int Ns, i, num_same = 0;

/* Keep a count of the number of Ns ... we will reject the match if
* there are too many */
Ns = 0;

for(i = 0; i < window_length; i++){
if (!((repeat_type[i] == *(leading_seq+i)) || (*(leading_seq+i) == 'N')))
return(FALSE);
/* Check that we don't have something of the form poly-A or poly-T */
if (repeat_type == repeat_type[i]) num_same++;
if(*(leading_seq+i) == 'N') Ns++;
}

/* If we made it here we have a match unless the window has too many Ns */
if(num_same == window_length) return(FALSE);
return(Ns <= (window_length/2));

}

void find_best_target(num_repeats)
int num_repeats;
{
int i, j, k;
REPEAT_INFO temp;

target_start = 0; target_end = 0;
/* Sort the repeats according to start site, see if we can fit
* all the repeats into one product */
for(i = 0; i < num_repeats; ++i)
for(j = num_repeats - 1; j > i; --j)
if(repeats[j-1].start > repeats[j].start){
temp = repeats[j-1];
repeats[j-1] = repeats[j];
repeats[j] = temp;}

if((repeats[num_repeats-1].start + repeats[num_repeats-1].length - repeats.start) < 1200){
target_start = repeats.start;
target_end = repeats[num_repeats-1].start + repeats[num_repeats-1].length - 1;}

else{
for(i = 0; i < num_repeats; ++i)
for(j = num_repeats - 1; j > i; --j)
if(repeats[j-1].length < repeats[j].length){
temp = repeats[j-1];
repeats[j-1] = repeats[j];
repeats[j] = temp;}
target_start = repeats.start;
target_end = repeats.start + repeats.length -1;
}
}

int same_repeat_type(found_type, specified_type)
char *found_type, *specified_type;
{

int found_length, specified_length;

found_length = strlen(found_type);
specified_length = strlen(specified_type);

if(found_length == specified_length){
if((strcmp(found_type, specified_type) == 0)) return(TRUE);}
return(FALSE);

}