/*
 * wordpat.c
 *
 * Download this program
 */

/***********************************************************************CMOD**
 * A utility to calculate character patterns of words and dictionaries.
 * Copyright Paul Johnston, 1999 - 2000. See legal.html for details.
 *****************************************************************************/

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*****************************************************************************
 * Global constants
 *****************************************************************************/
/* Size of proc_dict's table of output files that are kept open; words of
 * length MAX_OPEN or more are written by opening their file on demand. */
#define MAX_OPEN 30
/* Size of the fixed character buffers used for words, patterns, file names
 * and commands. */
#define MAX_LEN 256
/* proc_dict prints one progress dot per DOT_PERIOD words written. */
#define DOT_PERIOD 500
/* Pattern symbols: '0'-'9' followed by 'A'-'P' (26 characters, so the valid
 * indices are 0..25). */
const char *base26 = "0123456789ABCDEFGHIJKLMNOP";

/*****************************************************************************
 * Function prototypes
 *****************************************************************************/
/* Strip non-alphabetic characters from word in place and upper-case the rest */
void tidy_word(char *word);
/* Write the pattern of word into pat; returns the length of word */
int word_pat(const char *word, char *pat);
/* Append "PATTERN WORD" lines for each word in dict_name to <length>.rwl;
 * returns the longest word length seen so far */
int proc_dict(const char *dict_name, int longest_word);
/* Turn 1.rwl .. <longest_word>.rwl into sorted, duplicate-free .wl files */
void sort_words(int longest_word);

/*****************************************************************************
 * main - main entry point; interpret command line and call support routines
 *
 * With no command line arguments a single word is read from stdin and its
 * pattern is printed. Otherwise every argument is taken to be a dictionary
 * file: each is passed to proc_dict, then sort_words is called once to
 * compile the results. Always returns 0.
 *****************************************************************************/
int main(int argc, char *argv[])
{
  char word[MAX_LEN];
  /***************************************************************************
   * A pattern is "<length>-" followed by one symbol per letter, so it needs
   * a few more bytes than the word it describes.
   ***************************************************************************/
  char patt[MAX_LEN + 16];
  int longest_word;
  int ii;

  printf ("\nWord Pattern Calculator, by Paul Johnston\n\n");

  /***************************************************************************
   * If there are no command line arguments, prompt user for a single word
   ***************************************************************************/
  if (argc == 1)
  {
    printf ("Enter word: ");
    fflush(stdout);

    /*************************************************************************
     * On end-of-file or a read error nothing is stored in word, so treat
     * that case as an empty word rather than reading an unset buffer.
     *************************************************************************/
    if (fgets (word, sizeof(word), stdin) == NULL)
    {
      word[0] = 0;
    }
    tidy_word(word);
    word_pat(word, patt);
    printf ("Pattern: %s\n", patt);
  }

  /***************************************************************************
   * Otherwise, command line is a list of dictionary files. Call proc_dict
   * for each dictionary, remembering the longest word of all of them. Then
   * call sort_words to compile the lot.
   ***************************************************************************/
  else
  {
    longest_word = 0;
    for (ii = 1; ii < argc; ii++)
    {
      printf("%s ", argv[ii]);
      longest_word = proc_dict(argv[ii], longest_word);
      printf("\n");
    }
    printf ("Sorting ");
    sort_words(longest_word);
    printf ("\n");
  }

  printf("\n");

  return 0;
}


/*****************************************************************************
 * tidy_word - strips non alphabetic characters and puts word into upper case
 *
 * The string is rewritten in place: every character for which isalpha() is
 * true is kept (converted with toupper()), everything else - including the
 * newline left by fgets - is dropped, and the result is NUL terminated.
 *****************************************************************************/
void tidy_word(char *word)
{
  const char *src;
  char *dst = word;

  for (src = word; *src; src++)
  {
    /*************************************************************************
     * The <ctype.h> functions require a value representable as unsigned
     * char; passing a negative plain char is undefined behaviour.
     *************************************************************************/
    unsigned char ch = (unsigned char)*src;

    if (isalpha(ch))
    {
      *dst++ = (char)toupper(ch);
    }
  }
  *dst = 0;
}


/*****************************************************************************
 * word_pat - calculate the pattern for word, and place in pat. For example:
 *    ABBA       4-1221
 *    COMPUTER   8-12345678
 *    IT         2-12
 *
 * The pattern is the word length, a '-', then one symbol per letter. Each
 * distinct letter is given the next unused symbol ('1'-'9', then 'A'-'Q')
 * the first time it appears, and that symbol is repeated for every later
 * occurrence of the letter.
 *
 *   word  NUL terminated string containing only the letters 'A' to 'Z'
 *   pat   output buffer; must hold strlen(word) symbols plus the
 *         "<length>-" prefix and the terminating NUL
 *
 * Returns the length of word.
 *****************************************************************************/
int word_pat(const char *word, char *pat)
{
  /***************************************************************************
   * One symbol for each of the 26 possible distinct letters, so the table
   * can never be indexed past its end.
   ***************************************************************************/
  static const char symbols[] = "123456789ABCDEFGHIJKLMNOPQ";
  char first_sym[26];
  int uniq_chrs = 0;
  int pat_len;
  int slot;
  int ii;

  /***************************************************************************
   * first_sym holds, for each letter, the symbol assigned at its first
   * occurrence; zero means the letter has not been seen yet.
   ***************************************************************************/
  memset (first_sym, 0, sizeof(first_sym));

  /***************************************************************************
   * pattern begins "length-"; sprintf returns the number of characters
   * written, which is where the symbols start.
   ***************************************************************************/
  pat_len = sprintf(pat, "%lu-", (unsigned long)strlen(word));

  /***************************************************************************
   * Loop through each character in the word
   ***************************************************************************/
  for (ii = 0; word[ii]; ii++)
  {
    slot = word[ii] - 'A';

    /*************************************************************************
     * If this character not yet encountered, then assign the next symbol
     *************************************************************************/
    if (first_sym[slot] == 0)
    {
      first_sym[slot] = symbols[uniq_chrs];
      uniq_chrs++;
    }

    /*************************************************************************
     * Output the symbol of the first occurrence of this character
     *************************************************************************/
    pat[pat_len++] = first_sym[slot];
  }
  pat[pat_len] = 0;

  /***************************************************************************
   * Return length of word
   ***************************************************************************/
  return ii;
}


/*****************************************************************************
 * proc_dict - this takes a file with one word on each line, and calculates
 * the pattern of each word. The output is placed in separate files based on
 * the length of the word - 1.rwl ... <n>.rwl - with each line taking the
 * format "PATTERN WORD". The output files are opened for appending, so the
 * results of several dictionaries accumulate.
 *
 *   dict_name     name of the dictionary file to read
 *   longest_word  longest word length found so far
 *
 * Returns the larger of longest_word and the longest word read from this
 * dictionary. Errors are reported on stdout; processing of this dictionary
 * then stops and the longest length seen up to that point is returned.
 *****************************************************************************/
int proc_dict(const char *dict_name, int longest_word)
{
  char word[MAX_LEN];
  /***************************************************************************
   * A pattern is "<length>-" followed by one symbol per letter, so it needs
   * a few more bytes than the word it describes.
   ***************************************************************************/
  char pat[MAX_LEN + 16];
  char file_name[MAX_LEN];
  FILE *out_file[MAX_OPEN];
  FILE *dict_file;
  FILE *long_file;
  int word_len;
  int do_dot = 0;
  int ii;

  /***************************************************************************
   * Start by nulling out file handles, so we don't try to close them on an
   * early error.
   ***************************************************************************/
  for (ii = 0; ii < MAX_OPEN; ii++)
  {
    out_file[ii] = NULL;
  }

  /***************************************************************************
   * Open the dictionary file for reading
   ***************************************************************************/
  dict_file = fopen(dict_name, "rt");
  if (dict_file == NULL)
  {
    printf("\nERROR: Failed to open '%s' for reading\n", dict_name);
    goto cleanup;
  }

  /***************************************************************************
   * Open the output files. Slot 0 is never used: empty words are skipped.
   ***************************************************************************/
  for (ii = 1; ii < MAX_OPEN; ii++)
  {
    sprintf(file_name, "%d.rwl", ii);
    out_file[ii] = fopen (file_name, "at");
    if(out_file[ii] == NULL)
    {
      printf("\nERROR: Failed to open '%s' for appending\n", file_name);
      goto cleanup;
    }
  }

  /***************************************************************************
   * Loop through each line of the dictionary file. Testing the result of
   * fgets (rather than feof afterwards) keeps a final line that has no
   * trailing newline, and also ends the loop on a read error.
   ***************************************************************************/
  while (fgets(word, sizeof(word), dict_file) != NULL)
  {
    /*************************************************************************
     * Tidy up word, calculate pattern and keep track of longest word so far.
     *************************************************************************/
    tidy_word(word);
    word_len = word_pat(word, pat);
    if (word_len > longest_word)
    {
      longest_word = word_len;
    }

    /*************************************************************************
     * Skip zero length words
     *************************************************************************/
    if (word_len == 0)
    {
      continue;
    }

    /*************************************************************************
     * Write the pattern and word to an output file. If the word is short,
     * this file will already be open; otherwise we have to open and close it
     * now.
     *************************************************************************/
    if (word_len < MAX_OPEN)
    {
      fprintf (out_file[word_len], "%s %s\n", pat, word);
    }
    else
    {
      sprintf(file_name, "%d.rwl", word_len);
      long_file = fopen (file_name, "at");
      if(long_file == NULL)
      {
        printf("\nERROR: Failed to open '%s' for appending\n", file_name);
        goto cleanup;
      }
      fprintf (long_file, "%s %s\n", pat, word);
      fclose(long_file);
    }

    /*************************************************************************
     * Update display periodically to show that something is happening
     *************************************************************************/
    if (do_dot == 0)
    {
      fputc('.', stdout);
      fflush(stdout);
    }
    do_dot = (do_dot + 1) % DOT_PERIOD;
  }

  if (ferror(dict_file))
  {
    printf("\nERROR: Failed while reading '%s'\n", dict_name);
  }

cleanup:
  /***************************************************************************
   * Close the output files (reached on success as well as on error)
   ***************************************************************************/
  for (ii = 1; ii < MAX_OPEN; ii++)
  {
    if(out_file[ii] != NULL)
    {
      fclose(out_file[ii]);
    }
  }
  if(dict_file != NULL)
  {
    fclose(dict_file);
  }

  /***************************************************************************
   * Return the length of the longest word encountered
   ***************************************************************************/
  return longest_word;
}


/*****************************************************************************
 * sort_words - This formats the *.rwl files, making *.wl files. Firstly,
 * the external sort program is used to make *.swl, then these sorted files
 * are reformatted, with duplicate words being removed.
 *
 * In a .wl file each pattern appears once on a line of its own, followed by
 * the words that have that pattern, one per line.
 *
 *   longest_word  files 1.rwl to <longest_word>.rwl are processed; lengths
 *                 for which no .rwl file exists are skipped
 *
 * Errors are reported on stdout and the affected length is skipped. The
 * .rwl and .swl files of a length are deleted once its .wl file is written.
 *****************************************************************************/
void sort_words(int longest_word)
{
  FILE *in_file;
  FILE *out_file;
  char file_name[MAX_LEN];
  /***************************************************************************
   * A sorted line is "PATTERN WORD\n", which is roughly twice as long as the
   * word itself, so the line buffers are sized accordingly.
   ***************************************************************************/
  char line[2 * MAX_LEN + 16];
  char prev_word[2 * MAX_LEN + 16];
  char prev_pat[2 * MAX_LEN + 16];
  char cmd[MAX_LEN];
  char *word;
  int ii;

  /***************************************************************************
   * Loop through files from 1.rwl to <longest_word>.rwl
   ***************************************************************************/
  for (ii = 1; ii <= longest_word; ii++)
  {

    /*************************************************************************
     * Test to see if the file exists
     *************************************************************************/
    sprintf(file_name, "%d.rwl", ii);
    in_file = fopen (file_name, "rt");
    if(in_file == NULL)
    {
      continue;
    }
    fclose(in_file);

    /*************************************************************************
     * Use external sorting command to sort <ii>.rwl to <ii>.swl. If the
     * command fails, leave the raw file in place instead of deleting it.
     *************************************************************************/
    sprintf (cmd, "sort <%d.rwl >%d.swl", ii, ii);
    if (system (cmd) != 0)
    {
      printf("\nERROR: Failed to sort '%s'\n", file_name);
      continue;
    }

    /*************************************************************************
     * Update display to show that something is happening
     *************************************************************************/
    fputc('.', stdout);
    fflush(stdout);

    /*************************************************************************
     * Open <ii>.swl for input
     *************************************************************************/
    sprintf (file_name, "%d.swl", ii);
    in_file = fopen (file_name, "rt");
    if (in_file == NULL)
    {
      printf("\nERROR: Failed to open '%s' for reading\n", file_name);
      continue;
    }

    /*************************************************************************
     * Open <ii>.wl for output
     *************************************************************************/
    sprintf (file_name, "%d.wl", ii);
    out_file = fopen (file_name, "wt");
    if (out_file == NULL)
    {
      printf("\nERROR: Failed to open '%s' for writing\n", file_name);
      fclose(in_file);
      continue;
    }

    /*************************************************************************
     * Nothing has been output for this file yet, so there is no previous
     * pattern or word to compare against.
     *************************************************************************/
    prev_pat[0] = 0;
    prev_word[0] = 0;

    /*************************************************************************
     * Loop through each line of <ii>.swl. Testing the result of fgets also
     * ends the loop on a read error.
     *************************************************************************/
    while (fgets(line, sizeof(line), in_file) != NULL)
    {
      /***********************************************************************
       * Split line into pattern and word. The word keeps the newline that
       * fgets stored, which is why none is added when it is written out.
       ***********************************************************************/
      word = strchr(line, ' ');
      if (word == NULL)
      {
        printf("\nERROR: Malformed line '%s' in sorted file\n", line);
        continue;
      }
      *word = 0;
      word++;

      /***********************************************************************
       * If this is a duplicate word, then skip it
       ***********************************************************************/
      if (strcmp(prev_word, word) == 0)
      {
        continue;
      }

      /***********************************************************************
       * If this is the same pattern as last, just output word; Otherwise
       * output pattern and word
       ***********************************************************************/
      if (strcmp(prev_pat, line) == 0)
      {
        fprintf (out_file, "%s", word);
      }
      else
      {
        fprintf (out_file, "%s\n%s", line, word);
      }

      /***********************************************************************
       * Remember pattern and word
       ***********************************************************************/
      strcpy (prev_pat, line);
      strcpy (prev_word, word);
    }

    /*************************************************************************
     * Close <ii>.swl and <ii>.wl
     *************************************************************************/
    fclose (out_file);
    fclose (in_file);

    /*************************************************************************
     * Delete <ii>.rwl and <ii>.swl
     *************************************************************************/
    sprintf (file_name, "%d.rwl", ii);
    remove (file_name);
    sprintf (file_name, "%d.swl", ii);
    remove (file_name);

    /*************************************************************************
     * Update display to show that something is happening
     *************************************************************************/
    fputc('.', stdout);
    fflush(stdout);
  }
}
/* © 1998 - 2012 Paul Johnston, distributed under the BSD License. Updated: 10 Jun 2009 */