CINXE.COM
/* ** MARKOV3.C */ #include <stdio.h> #include <ctype.h> #include <string.h> #include <stdlib.h> #include <math.h> #include <time.h> #define TRUE 1 #define FALSE 0 typedef struct wd Word; typedef struct sec /* Record of second and third words and their counts */ { Word *w; // the second word Word *z; // the third word int count; // how many times we found it struct sec *next; } Second; typedef struct wd /* Record of one word found */ { char *token; // Pointer to the token itself Second *seconds; // Words that follow it } Word; #define MAXTOKEN 40000 int nToken; /* Number of words defined */ Word *tokens[MAXTOKEN]; /* List of words and counts */ Word *tokenOne = NULL; Word *tokenTwo = NULL; #define isPunc(ch) (strchr("—,.;?!¿¡“”\"",ch)) Word *searchToken( char *token, int create ) /* Find the token in the token list, using a binary search. * * If the token is found, return a pointer to it. * * If not, add it here. We will know where to put it from where * the search stopped, and since the list of tokens is linear it's * a simple matter to update it: we just move everything down one. * * Don’t add a new token if create is false. * * If there are too many tokens to add another, we just return NULL. */ { int beg, mid, end, diff; if (nToken == 0) { /* Blank list? Easy to see where to add the new entry. */ mid = 0; } else { /* Search for the entry, using a binary search. */ beg = 0; end = nToken - 1; while (beg <= end) { mid = (beg + end) / 2; diff = strcmp( token, tokens[mid]->token ); if (!diff) /* token == mid */ return( tokens[mid] ); else if (diff > 0) /* token > mid */ beg = mid + 1; else end = mid - 1; /* token < mid */ } /* Once the loop fails, the new entry should be added * either at mid or at mid + 1, depending on the results * of the last search. */ if (diff > 0) mid++; } if (create) { if (nToken < MAXTOKEN) { /* Add the new entry. */ memmove( &tokens[mid+1], &tokens[mid], (nToken - mid) * sizeof(Word) ); Word *w = malloc(sizeof(Word)); if (!w) { printf("Out of memory\n"); exit(1); }; w->seconds = NULL; w->token = malloc( strlen(token) + 1 ); if (!w->token) { printf("Out of memory\n"); exit(1); }; strcpy( w->token, token ); tokens[mid] = w; nToken++; return( w ); } else { printf( "\nTOO MANY WORDS!\n" ); exit(1); } } /* Too many entries, or no creation wanted. Return. */ return(NULL); } /*searchToken*/ /* Find the word w in the list of seconds. * If found, increment its count. * If not, add to the list. * This is an unsorted list, but we hope it doesn't grow too large. */ void incSecondList(Word *first, Word *second, Word *third) { Second *sec; Second *last = NULL; /* Linear search for second + third words */ for (sec = first->seconds; sec; sec = sec->next) { if (!strcmp(sec->w->token, second->token) && !strcmp(sec->z->token, third->token)) break; last = sec; } /* If we found it, increment it and go */ if (sec) { sec->count++; return; } /* Didn't find it. Allocate a new one */ sec = malloc(sizeof(Second)); if (!sec) { printf("Out of memory\n"); exit(1); } sec->w = second; sec->z = third; sec->count = 1; sec->next = NULL; // Link to the new object if (last) last->next = sec; else first->seconds = sec; } /* We have a new token. * Add it to the list of words in lastToken. */ void addToken(char *token) { Word *w = searchToken(token, TRUE); if (tokenOne && tokenTwo) { incSecondList(tokenOne, tokenTwo, w); } tokenOne = tokenTwo; tokenTwo = w; } int NextToken( char **sp ) /* Gets the next token in s (if any) and processes it. * The search begins at *sp. *sp is incremented so this routine * can be called in a loop. * * RETURN VALUE * Whether there is anything else to process in this line. */ { char ch; char *s, *token; /* Skip blanks */ for (s = *sp; *s && isspace(*s); s++) ; /* Token = part of s before next blank. * Either get a string of punctuation, or a string of non-punctuation. */ if (isPunc(*s)) { for (token = s; *s && isPunc(*s) && !isspace(*s); s++) ; } else { for (token = s; *s && !isPunc(*s) && !isspace(*s); s++) *s = tolower(*s); } *sp = s; if (s - token) { /* Temporarily add a null character at end of token */ ch = *s; *s = '\0'; /* Process token */ addToken(token); *s = ch; return(TRUE); } else return(FALSE); } /*NextToken*/ // Read the corpus void ReadFile( char *fn ) { FILE *f; char raw[512]; char cooked[1024]; char remainder[512]; char *t; *remainder = '\0'; f = fopen( fn, "r" ); if (!f) { printf( "Can’t open file %s\n", fn ); return; } printf( "\nReading file %s.\n", fn ); while (fgets( raw, 100, f )) { /* Unix/Mac discrepancy with CRLF. Remove both. */ for (t = raw; *t; t++) if (*t == '\r' || *t == '\n') *t = ' '; /* Add remainder from previous line, if any */ strcpy(cooked, remainder); strcat(cooked, raw); /* Find the new remainder */ t = cooked + strlen(cooked) - 1; while (t > cooked && !isspace(*t)) t--; if (t == cooked) *remainder = '\0'; else { strcpy(remainder, t + 1); *t = '\0'; } /* Process tokens */ for (t = cooked; NextToken(&t); ) ; } /* there might be something in remainder */ for (t = remainder; NextToken(&t); ) ; fclose(f); } /*ReadFile*/ /* Print one word with special handling for punctuation */ void printNode(Word *w) { if (w) { char ch = w->token[0]; if (!isPunc(ch)) printf(" "); printf( "%s", w->token ); if (ch == '.' || ch == '?' || ch == '!') printf("\n"); } } /* Generate one word from this token */ Word *markovNode(Word *one, Word *two) { if (!one) return(NULL); /* Get the total word count */ int n = 0; Second *sec; for (sec = one->seconds; sec; sec = sec->next) { if (sec->w == two) n += sec->count; } /* Pick a random number between 0 and n - 1 */ int tgt = rand() % n; /* Go through counts again, stopping when we hit tgt */ n = 0; for (sec = one->seconds; sec; sec = sec->next) { if (sec->w == two) { n += sec->count; if (n >= tgt) { printNode(sec->z); return sec->z; } } } return NULL; } /* Generate two words from this token */ Second *markovTwo(Word *w) { if (!w) return(NULL); /* Get the total word count */ int n = 0; Second *sec; for (sec = w->seconds; sec; sec = sec->next) n += sec->count; /* Pick a random number between 0 and n - 1 */ int tgt = rand() % n; /* Go through counts again, stopping when we hit tgt */ n = 0; for (sec = w->seconds; sec; sec = sec->next) { n += sec->count; if (n >= tgt) { return sec; } } return NULL; } /* Generate a text */ void markovStart() { /* This is a kludge. * Find the first word that has over 1000 second-words. */ int i; int c; for (i = 0; i < nToken; i++) { Second *sec = NULL; c = 0; for (sec = tokens[i]->seconds; sec; sec = sec->next) c++; if (c > 25) break; } if (i >= nToken) { printf( "No start points found\n"); return; } printf("Initial token should be %s with a count of %i\n", tokens[i]->token, c ); /* Generate a text */ Word *w = tokens[i]; printNode(w); /* We need two more words */ Second *sec = markovTwo(w); if (sec) { Word *one = sec->w; Word *two = sec->z; printNode(one); printNode(two); for (i = 0; one && i < 300; i++) { Word *newtwo = markovNode(one, two); one = two; two = newtwo; } } printf("\n"); } // Process int main(int argc, char *argv[]) { int i; /* Initializes random number generator */ srand((unsigned) time(NULL)); /* Read the input file(s) */ if (argc == 1) ReadFile("4901"); else for (i = 1; i < argc; i++) ReadFile(argv[i]); printf("Total tokens in corpus: %i\n", nToken ); /* TO GET A DATABASE FILE: * Uncomment this, and pipe the output to a file. * The file can then be passed to the same program to generate text. * for (i = 0; i < nToken; i++) { printf("%s\n", tokens[i]->token); Second *sec = NULL; for (sec = tokens[i]->seconds; sec; sec = sec->next) { printf(" %s %s %i\n", sec->w->token, sec->z->token, sec->count ); } } */ /* IF YOU ARE GENERATING A DATABASE FILE, also comment out this call. */ /* Generate text */ markovStart(); }