apriori.c
上传用户:lengbin
上传日期:2010-03-31
资源大小:121k
文件大小:39k
- /*----------------------------------------------------------------------
- File : apriori.c
- Contents: apriori algorithm for finding association rules
- Author : Christian Borgelt
- History : 14.02.1996 file created
- 26.07.1996 output precision reduced
- 22.11.1996 options -b, -f, and -r added
- 24.11.1996 option -e added (add. evaluation measures)
- 18.08.1997 normalized chi^2 measure added
- option -m (minimal rule length) added
- 13.10.1997 quiet version (no output to stdout or stderr)
- 27.01.1998 adapted to changed ist_create() function
- 08.08.1998 optional input file (item appearances) added
- 02.09.1998 several assertions added
- 07.09.1998 hyperedge mode (option -h) added
- 08.12.1998 output of absolute support (option -a) added
- float changed to double
- 09.12.1998 conversion of names to a scanable form added
- 05.02.1999 long int changed to int
- 09.02.1999 input from stdin, output to stdout added
- 09.08.1999 bug in check of support parameter (<= 0) fixed
- 05.11.1999 rule evaluation measure EM_AIMP added
- 08.11.1999 output of add. rule eval. measure value added
- 16.03.2000 optional use of original rule support definition
- 01.04.2001 option -h replaced by option -t (target type)
- 26.05.2001 extended support output added (option -x)
- 09.06.2001 extended support output for item sets added
- 15.08.2001 module scan used for output formatting
- 18.11.2001 item and transaction functions made a module
- 19.11.2001 options -i, -l changed, option -y removed
- 28.12.2001 adapted to module tract, some improvements
- 11.01.2002 evaluation measures codes changed to letters
- 10.02.2002 option -q extended by a direction parameter
- 11.02.2002 memory usage minimization option added
- 09.06.2002 arbitrary supp./conf. formats made possible
- 09.01.2003 option -k (item separator) added
- 14.01.2003 check for empty transaction set added
- 12.03.2003 output of lift value (conf/prior) added
- 17.07.2003 item filtering w.r.t. usage added (option -u)
- 17.07.2003 sorting w.r.t. transaction size sum added
- 18.07.2003 maximal itemset filter added
- 11.08.2003 closed itemset filter added
- 15.08.2003 item filtering for transaction tree added
- 16.08.2003 parameter for transaction filtering added
- 18.08.2003 dynamic filtering decision based on times added
- 21.08.2003 option -j (heap sort for transactions) added
- 22.09.2003 meaning of option -j reversed (heapsort default)
- 25.03.2004 option -S added (maximal support of a set/rule)
- 09.05.2004 additional selection measure for sets added
- 28.10.2004 two unnecessary assignments removed
- 20.11.2004 bug in evaluation of -j (heap/quicksort) fixed
- 23.11.2004 absolute/relative support output changed
- 09.12.2004 semantics of option -p changed
- 25.01.2005 bug in output of absolute/relative support fixed
- 31.01.2005 another bug in this output fixed
- 20.06.2005 use of flag for "no item sorting" corrected
- ----------------------------------------------------------------------*/
- #include <stdio.h>
- #include <stdlib.h>
- #include <stdarg.h>
- #include <string.h>
- #include <math.h>
- #include <time.h>
- #include <assert.h>
- #include "scan.h"
- #include "tract.h"
- #include "istree.h"
- #ifdef STORAGE
- #include "storage.h"
- #endif
- /*----------------------------------------------------------------------
- Preprocessor Definitions
- ----------------------------------------------------------------------*/
- #define PRGNAME "apriori"
- #define DESCRIPTION "find association rules with the apriori algorithm"
- #define VERSION "version 4.27 (2005.06.20) "
- "(c) 1996-2005 Christian Borgelt"
- /* --- target types --- */
- #define TT_SET 0 /* frequent item sets */
- #define TT_MFSET 1 /* maximally frequent item sets */
- #define TT_CLSET 2 /* closed item sets */
- #define TT_RULE 3 /* association rules */
- #define TT_HEDGE 4 /* association hyperedges */
- /* --- error codes --- */
- #define E_OPTION (-5) /* unknown option */
- #define E_OPTARG (-6) /* missing option argument */
- #define E_ARGCNT (-7) /* too few/many arguments */
- #define E_STDIN (-8) /* double assignment of stdin */
- #define E_TARGET (-9) /* invalid target type */
- #define E_SUPP (-10) /* invalid support */
- #define E_CONF (-11) /* invalid confidence */
- #define E_MEASURE (-12) /* invalid evaluation measure */
- #define E_MVAL (-13) /* invalid value for measure */
- #define E_RULELEN (-14) /* invalid rule length */
- #define E_NOTAS (-15) /* no items or transactions */
- #define E_UNKNOWN (-21) /* unknown error */
- #ifndef QUIET /* if not quiet version */
- #ifdef FFLUSH
- #define MSG(x) x /* print messages */
- #else /* if to flush every output */
- #define MSG(x) x, fflush(stderr)
- #endif
- #else /* if quiet version */
- #define MSG(x) /* suppress messages */
- #endif
- #define SEC_SINCE(t) ((clock()-(t)) /(double)CLOCKS_PER_SEC)
- #define RECCNT(s) (tfs_reccnt(is_tfscan(s))
- + ((tfs_delim(is_tfscan(s)) == TFS_REC) ? 0 : 1))
- #define BUFFER(s) tfs_buf(is_tfscan(s))
- /*----------------------------------------------------------------------
- Constants
- ----------------------------------------------------------------------*/
- #ifndef QUIET /* if not quiet version */
- /* --- target types --- */
- static const char *ttypes[] = {
- /* TT_SET 0 */ "set",
- /* TT_MFSET 1 */ "set",
- /* TT_CLSET 2 */ "set",
- /* TT_RULE 3 */ "rule",
- /* TT_HEDGE 4 */ "hyperedge",
- };
- /* --- error messages --- */
- static const char *errmsgs[] = {
- /* E_NONE 0 */ "no errorn",
- /* E_NOMEM -1 */ "not enough memoryn",
- /* E_FOPEN -2 */ "cannot open file %sn",
- /* E_FREAD -3 */ "read error on file %sn",
- /* E_FWRITE -4 */ "write error on file %sn",
- /* E_OPTION -5 */ "unknown option -%cn",
- /* E_OPTARG -6 */ "missing option argumentn",
- /* E_ARGCNT -7 */ "wrong number of argumentsn",
- /* E_STDIN -8 */ "double assignment of standard inputn",
- /* E_TARGET -9 */ "invalid target type '%c'n",
- /* E_SUPP -10 */ "invalid minimal support %g%%n",
- /* E_CONF -11 */ "invalid minimal confidence %g%%n",
- /* E_MEASURE -12 */ "invalid additional evaluation measure %cn",
- /* E_MVAL -13 */ "invalid value %g%% for evaluation measuren",
- /* E_RULELEN -14 */ "invalid set size/rule length %dn",
- /* E_NOTAS -15 */ "no items or transactions to work onn",
- /* E_ITEMEXP -16 */ "file %s, record %d: item expectedn",
- /* E_DUPITEM -17 */ "file %s, record %d: duplicate item %sn",
- /* E_APPEXP -18 */ "file %s, record %d: "
- "appearance indicator expectedn",
- /* E_UNKAPP -19 */ "file %s, record %d: "
- "unknown appearance indicator %sn",
- /* E_FLDCNT -20 */ "file %s, record %d: too many fieldsn",
- /* E_UNKNOWN -21 */ "unknown errorn"
- };
- #endif
- /*----------------------------------------------------------------------
- Global Variables
- ----------------------------------------------------------------------*/
- #ifndef QUIET
- static char *prgname; /* program name for error messages */
- #endif
- static ITEMSET *itemset = NULL; /* item set */
- static TASET *taset = NULL; /* transaction set */
- static TATREE *tatree = NULL; /* transaction tree */
- static ISTREE *istree = NULL; /* item set tree */
- static FILE *in = NULL; /* input file */
- static FILE *out = NULL; /* output file */
- /*----------------------------------------------------------------------
- Main Functions
- ----------------------------------------------------------------------*/
- static void help (void)
- { /* --- print help on eval. measures */
- #ifndef QUIET
- fprintf(stderr, "n"); /* terminate startup message */
- printf("additional evaluation measures (option -e#)n");
- printf("frequent item sets:n");
- printf("d or 1: binary logarithm of support quotientn");
- printf("q or 2: difference of support quotient to 1n");
- printf("association rules:n");
- printf("d or 1: absolute confidence difference to priorn");
- printf("q or 2: absolute difference of confidence quotient to 1n");
- printf("a or 3: absolute difference of improvement value to 1n");
- printf("i or 4: information difference to priorn");
- printf("c or 5: normalized chi^2 measuren");
- #endif
- exit(0); /* abort the program */
- } /* help() */
- /*--------------------------------------------------------------------*/
- static void error (int code, ...)
- { /* --- print an error message */
- #ifndef QUIET /* if not quiet version */
- va_list args; /* list of variable arguments */
- const char *msg; /* error message */
- assert(prgname); /* check the program name */
- if (code < E_UNKNOWN) code = E_UNKNOWN;
- if (code < 0) { /* if to report an error, */
- msg = errmsgs[-code]; /* get the error message */
- if (!msg) msg = errmsgs[-E_UNKNOWN];
- fprintf(stderr, "n%s: ", prgname);
- va_start(args, code); /* get variable arguments */
- vfprintf(stderr, msg, args);/* print error message */
- va_end(args); /* end argument evaluation */
- }
- #endif
- #ifndef NDEBUG /* if debug version */
- if (istree) ist_delete(istree); /* clean up memory */
- if (tatree) tat_delete(tatree); /* and close files */
- if (taset) tas_delete(taset, 0);
- if (itemset) is_delete(itemset);
- if (in && (in != stdin)) fclose(in);
- if (out && (out != stdout)) fclose(out);
- #endif
- #ifdef STORAGE /* if storage debugging */
- showmem("at end of program"); /* check memory usage */
- #endif
- exit(code); /* abort the program */
- } /* error() */
- /*--------------------------------------------------------------------*/
- int main (int argc, char *argv[])
- { /* --- main function */
- int i, k = 0, n; /* loop variables, counters */
- char *s; /* to traverse the options */
- char **optarg = NULL; /* option argument */
- char *fn_in = NULL; /* name of input file */
- char *fn_out = NULL; /* name of output file */
- char *fn_app = NULL; /* name of item appearances file */
- char *blanks = NULL; /* blanks */
- char *fldseps = NULL; /* field separators */
- char *recseps = NULL; /* record separators */
- char *cominds = NULL; /* comment indicators */
- char *apps = NULL; /* item appearance indicator vector */
- double supp = 0.1; /* minimal support (in percent) */
- double smax = 1.0; /* maximal support (in percent) */
- double conf = 0.8; /* minimal confidence (in percent) */
- int rsdef = IST_BODY; /* rule support definition */
- int target = 'r'; /* target type (sets/rules/h.edges) */
- int arem = 0; /* additional rule evaluation measure */
- int lift = 0; /* flag for printing the lift */
- double minval = 0.1; /* minimal evaluation measure value */
- double lftval = 0; /* lift value (confidence/prior) */
- int minlen = 1; /* minimal rule length */
- int maxlen = 5; /* maximal rule length */
- int load = 1; /* flag for loading transactions */
- int sort = 2; /* flag for item sorting and recoding */
- double filter = 0.1; /* item usage filtering parameter */
- int tree = 1; /* flag for transaction tree */
- int heap = 1; /* flag for heap sort vs. quick sort */
- int memopt = 0; /* flag for memory usage optimization */
- int c2scf = 0; /* flag for conv. to scanable form */
- char *sep = " "; /* item separator for output */
- char *fmt = "%.1f"; /* output format for support/conf. */
- int sout = 1; /* flag for abs./rel. support output */
- int ext = 0; /* flag for extended support output */
- int aval = 0; /* flag for add. eval. measure value */
- int maxcnt = 0; /* maximal number of items per set */
- int tacnt; /* number of transactions */
- int *map, *set; /* identifier map, item set */
- const char *name; /* buffer for item names */
- static char buf[4*TFS_SIZE+4];/* buffer for formatting */
- clock_t t, tt, tc, x; /* timer for measurements */
- #ifndef QUIET /* if not quiet version */
- prgname = argv[0]; /* get program name for error msgs. */
- /* --- print usage message --- */
- if (argc > 1) { /* if arguments are given */
- fprintf(stderr, "%s - %sn", argv[0], DESCRIPTION);
- fprintf(stderr, VERSION); } /* print a startup message */
- else { /* if no arguments given */
- printf("usage: %s [options] infile outfile [appfile]n", argv[0]);
- printf("%sn", DESCRIPTION);
- printf("%sn", VERSION);
- printf("-t# target type (default: association rules)n"
- " (s: item sets, c: closed item sets,"
- " m: maximal item sets,n"
- " r: association rules,"
- " h: association hyperedges)n");
- printf("-m# minimal number of items per set/rule/hyperedge "
- "(default: %d)n", minlen);
- printf("-n# maximal number of items per set/rule/hyperedge "
- "(default: %d)n", maxlen);
- printf("-s# minimal support of a set/rule/hyperedge "
- "(default: %g%%)n", supp *100);
- printf("-S# maximal support of a set/rule/hyperedge "
- "(default: %g%%)n", smax *100);
- printf("-c# minimal confidence of a rule/hyperedge "
- "(default: %g%%)n", conf *100);
- printf("-o use original definition of the support of a rule "
- "(body & head)n");
- printf("-k# item separator for output "
- "(default: "%s")n", sep);
- printf("-p# output format for support/confidence "
- "(default: "%s")n", fmt);
- printf("-x extended support output "
- "(print both rule support types)n");
- printf("-a print absolute support "
- "(number of transactions)n");
- printf("-y print lift value (confidence divided by prior)n");
- printf("-e# additional evaluation measure (default: none)n");
- printf("-! print a list of additional evaluation measuresn");
- printf("-d# minimal value of additional evaluation measure "
- "(default: %g%%)n", minval *100);
- printf("-v print value of additional "
- "rule evaluation measuren");
- printf("-g write output in scanable form "
- "(quote certain characters)n");
- printf("-l do not load transactions into memory "
- "(work on input file)n");
- printf("-q# sort items w.r.t. their frequency (default: %d)n"
- " (1: ascending, -1: descending, 0: do not sort,n"
- " 2: ascending, -2: descending w.r.t. "
- "transaction size sum)n", sort);
- printf("-u# filter unused items from transactions "
- "(default: %g)n", filter);
- printf(" (0: do not filter items w.r.t. usage in sets,n"
- " <0: fraction of removed items for filtering,n"
- " >0: take execution times ratio into account)n");
- printf("-h do not organize transactions as a prefix treen");
- printf("-j use quicksort to sort the transactions "
- "(default: heapsort)n");
- printf("-z minimize memory usage "
- "(default: maximize speed)n");
- printf("-i# ignore records starting with a character "
- "in the given stringn");
- printf("-b/f/r# blank characters, field and record separatorsn"
- " (default: " \t\r", " \t", "\n")n");
- printf("infile file to read transactions fromn");
- printf("outfile file to write item sets/association rules"
- "/hyperedges ton");
- printf("appfile file stating item appearances (optional)n");
- return 0; /* print a usage message */
- } /* and abort the program */
- #endif /* #ifndef QUIET */
- /* --- evaluate arguments --- */
- for (i = 1; i < argc; i++) { /* traverse arguments */
- s = argv[i]; /* get option argument */
- if (optarg) { *optarg = s; optarg = NULL; continue; }
- if ((*s == '-') && *++s) { /* -- if argument is an option */
- while (*s) { /* traverse options */
- switch (*s++) { /* evaluate switches */
- case '!': help(); break;
- case 't': target = (*s) ? *s++ : 'r'; break;
- case 'm': minlen = (int)strtol(s, &s, 0); break;
- case 'n': maxlen = (int)strtol(s, &s, 0); break;
- case 's': supp = 0.01*strtod(s, &s); break;
- case 'S': smax = 0.01*strtod(s, &s); break;
- case 'c': conf = 0.01*strtod(s, &s); break;
- case 'o': rsdef = IST_BOTH; break;
- case 'k': optarg = &sep; break;
- case 'p': optarg = &fmt; break;
- case 'x': ext = 1; break;
- case 'a': sout |= 2; break;
- case 'y': lift = 1; break;
- case 'e': arem = (*s) ? *s++ : 0; break;
- case 'd': minval = 0.01*strtod(s, &s); break;
- case 'v': aval = 1; break;
- case 'g': c2scf = 1; break;
- case 'l': load = 0; break;
- case 'q': sort = (int)strtol(s, &s, 0); break;
- case 'u': filter = strtod(s, &s); break;
- case 'h': tree = 0; break;
- case 'j': heap = 0; break;
- case 'z': memopt = 1; break;
- case 'i': optarg = &cominds; break;
- case 'b': optarg = &blanks; break;
- case 'f': optarg = &fldseps; break;
- case 'r': optarg = &recseps; break;
- default : error(E_OPTION, *--s); break;
- } /* set option variables */
- if (optarg && *s) { *optarg = s; optarg = NULL; break; }
- } } /* get option argument */
- else { /* -- if argument is no option */
- switch (k++) { /* evaluate non-options */
- case 0: fn_in = s; break;
- case 1: fn_out = s; break;
- case 2: fn_app = s; break;
- default: error(E_ARGCNT); break;
- } /* note filenames */
- }
- }
- if (optarg) error(E_OPTARG); /* check option argument */
- if ((k < 2) || (k > 3)) /* and the number of arguments */
- error(E_ARGCNT); /* (either in/out or in/out/app) */
- if ((!fn_in || !*fn_in) && (fn_app && !*fn_app))
- error(E_STDIN); /* stdin must not be used twice */
- switch (target) { /* check and translate target type */
- case 's': target = TT_SET; break;
- case 'm': target = TT_MFSET; break;
- case 'c': target = TT_CLSET; break;
- case 'r': target = TT_RULE; break;
- case 'h': target = TT_HEDGE; break;
- default : error(E_TARGET, (char)target); break;
- }
- if (supp > 1) /* check the minimal support */
- error(E_SUPP, supp); /* (< 0: absolute number) */
- if ((conf < 0) || (conf > 1))
- error(E_CONF, conf); /* check the minimal confidence */
- if (minlen <= 0) error(E_RULELEN, minlen); /* check the limits */
- if (maxlen <= 0) error(E_RULELEN, maxlen); /* for the rule length */
- switch (arem) { /* check and translate measure */
- case 0 : case '0': arem = EM_NONE; break;
- case 'd': case '1': arem = EM_DIFF; break;
- case 'q': case '2': arem = EM_QUOT; break;
- case 'a': case '3': arem = EM_AIMP; break;
- case 'i': case '4': arem = EM_INFO; break;
- case 'c': case '5': arem = EM_CHI2; break;
- default : error(E_MEASURE, (char)arem); break;
- }
- if ((minval < 0) /* check the measure parameter */
- || ((target == TT_RULE) && (arem != EM_AIMP) && (minval > 1)))
- error(E_MVAL, minval); /* (must usually be between 0 and 1) */
- if (target == TT_HEDGE){ /* in hyperedge mode */
- minval = conf; conf = 1;}/* adapt the parameters */
- else if (target <= TT_CLSET){ /* in item set mode neutralize */
- rsdef = IST_BOTH; conf = 1;}/* rule specific settings */
- if (arem == EM_NONE) /* if no add. rule eval. measure, */
- aval = 0; /* clear the corresp. output flag */
- if ((filter <= -1) || (filter >= 1)) filter = 0;
- /* --- create item set and transaction set --- */
- itemset = is_create(); /* create an item set and */
- if (!itemset) error(E_NOMEM); /* set the special characters */
- is_chars(itemset, blanks, fldseps, recseps, cominds);
- if (load) { /* if to load the transactions */
- taset = tas_create(itemset);
- if (!taset) error(E_NOMEM); /* create a transaction set */
- } /* to store the transactions */
- MSG(fprintf(stderr, "n")); /* terminate the startup message */
- /* --- read item appearances --- */
- if (fn_app) { /* if item appearances are given */
- t = clock(); /* start the timer */
- if (*fn_app) /* if an app. file name is given, */
- in = fopen(fn_app, "r"); /* open the item appearances file */
- else { /* if no app. file name is given, */
- in = stdin; fn_app = "<stdin>"; } /* read from std. input */
- MSG(fprintf(stderr, "reading %s ... ", fn_app));
- if (!in) error(E_FOPEN, fn_app);
- k = is_readapp(itemset,in); /* read the item appearances */
- if (k != 0) error(k, fn_app, RECCNT(itemset), BUFFER(itemset));
- if (in != stdin) /* if not read from standard input, */
- fclose(in); /* close the input file */
- MSG(fprintf(stderr, "[%d item(s)] done ", is_cnt(itemset)));
- MSG(fprintf(stderr, "[%.2fs].n", SEC_SINCE(t)));
- } /* print a log message */
- /* --- read transactions --- */
- t = clock(); /* start the timer */
- if (fn_in && *fn_in) /* if an input file name is given, */
- in = fopen(fn_in, "r"); /* open input file for reading */
- else { /* if no input file name is given, */
- in = stdin; fn_in = "<stdin>"; } /* read from standard input */
- MSG(fprintf(stderr, "reading %s ... ", fn_in));
- if (!in) error(E_FOPEN, fn_in);
- for (tacnt = 0; 1; tacnt++) { /* transaction read loop */
- k = is_read(itemset, in); /* read the next transaction */
- if (k < 0) error(k, fn_in, RECCNT(itemset), BUFFER(itemset));
- if (k > 0) break; /* check for error and end of file */
- k = is_tsize(itemset); /* update the maximal */
- if (k > maxcnt) maxcnt = k; /* transaction size */
- if (taset && (tas_add(taset, NULL, 0) != 0))
- error(E_NOMEM); /* add the loaded transaction */
- } /* to the transaction set */
- if (taset) { /* if transactions have been loaded */
- if (in != stdin) fclose(in);/* if not read from standard input, */
- in = NULL; /* close the input file */
- } /* clear the file variable */
- n = is_cnt(itemset); /* get the number of items */
- MSG(fprintf(stderr, "[%d item(s),", n));
- MSG(fprintf(stderr, " %d transaction(s)] done ", tacnt));
- MSG(fprintf(stderr, "[%.2fs].", SEC_SINCE(t)));
- if ((n <= 0) || (tacnt <= 0)) error(E_NOTAS);
- MSG(fprintf(stderr, "n")); /* check for at least one transaction */
- if (supp < 0) { /* if absolute support is given */
- if (!(sout & 2)) sout = 2; /* switch to absolute support output */
- supp = (-100 *supp -0.25) /tacnt;
- if (supp < 0) supp = 0; /* compute a proper relative support */
- } /* and check it against 0 */
- if (smax < 0) { /* if absolute support is given */
- smax = (-100 *smax -0.25) /tacnt;
- if (smax < 0) smax = 0; /* compute a proper relative support */
- } /* and check it against 0 */
- /* --- sort and recode items --- */
- MSG(fprintf(stderr, "sorting and recoding items ... "));
- t = clock(); /* start the timer */
- map = (int*)malloc(is_cnt(itemset) *sizeof(int));
- if (!map) error(E_NOMEM); /* create an item identifier map */
- if (rsdef == IST_BODY) /* if rule support = body support */
- k = (int)ceil(tacnt *supp *conf);
- else /* if rule supp. = body&head support */
- k = (int)ceil(tacnt *supp); /* compute the absolut support */
- n = is_recode(itemset, k, sort, map);
- if (taset) { /* sort and recode the items and */
- tas_recode(taset, map,n); /* recode the loaded transactions */
- maxcnt = tas_max(taset); /* get the new maximal t.a. size */
- } /* (may be smaller than before) */
- free(map); /* delete the item identifier map */
- MSG(fprintf(stderr, "[%d item(s)] ", n));
- MSG(fprintf(stderr, "done [%.2fs].", SEC_SINCE(t)));
- if (n <= 0) error(E_NOTAS); /* print a log message and */
- MSG(fprintf(stderr, "n")); /* check the number of items */
- /* --- create a transaction tree --- */
- tt = 0; /* init. the tree construction time */
- if (tree && taset) { /* if transactions were loaded */
- MSG(fprintf(stderr, "creating transaction tree ... "));
- t = clock(); /* start the timer */
- tatree = tat_create(taset, heap);
- if (!tatree) error(E_NOMEM);/* create a transaction tree */
- if (filter == 0) { /* if a tree rebuild is not needed, */
- tas_delete(taset, 0); taset = NULL; } /* delete transactions */
- tt = clock() -t; /* note the time for the construction */
- MSG(fprintf(stderr, "done [%.2fs].n", SEC_SINCE(t)));
- } /* print a log message */
- /* --- create an item set tree --- */
- t = clock(); tc = 0; /* start the timer */
- apps = (char*)malloc(n *sizeof(char));
- if (!apps) error(E_NOMEM); /* get the appearance indicators */
- for (apps += i = n; --i >= 0; )
- *--apps = is_getapp(itemset, i);
- istree = ist_create(n, supp, conf, rsdef, apps, memopt);
- if (!istree) error(E_NOMEM); /* create an item set tree */
- for (k = n; --k >= 0; ) /* set single item frequencies */
- ist_setcnt(istree, k, is_getfrq(itemset, k));
- ist_settac(istree, tacnt); /* set the number of transactions */
- if (maxlen > maxcnt) /* clamp the rule length */
- maxlen = maxcnt; /* to the maximum set size */
- /* --- check item subsets --- */
- MSG(fprintf(stderr, "checking subsets of size 1"));
- while (ist_height(istree) < maxlen) {
- if (filter != 0) { /* if to filter w.r.t. item usage, */
- i = ist_check(istree, apps); /* check current item usage */
- if (i < maxlen) maxlen = i; /* update the maximum size */
- if (ist_height(istree) >= i) break;
- } /* check the tree height */
- k = ist_addlvl(istree); /* while max. height is not reached, */
- if (k < 0) error(E_NOMEM); /* add a level to the item set tree */
- if (k != 0) break; /* if no level was added, abort */
- MSG(fprintf(stderr, " %d", ist_height(istree)));
- if (tatree) { /* if a transaction tree was created */
- if (((filter < 0) /* if to filter w.r.t. item usage */
- && (i < -filter *n)) /* and enough items were removed */
- || ((filter > 0) /* or counting time is long enough */
- && (i < n) && (i *(double)tt < filter *n *tc))) {
- n = i; x = clock(); /* note the new number of items */
- tas_filter(taset,apps); /* and remove unnecessary items */
- tat_delete(tatree); /* delete the transaction tree */
- tatree = tat_create(taset, heap);
- if (!tatree) error(E_NOMEM);
- tt = clock() -x; /* rebuild the transaction tree and */
- } /* note the new construction time */
- x = clock(); /* count the transaction tree */
- ist_countx(istree, tatree);
- tc = clock() -x; } /* note the new count time */
- else if (taset) { /* if transactions were loaded */
- if (((filter < 0) /* if to filter w.r.t. item usage */
- && (i <= -filter *n)) /* and enough items were removed */
- || ((filter > 0) /* or counting time is long enough */
- && (i *(double)tt <= filter *n *tc))) {
- n = i; x = clock(); /* note the new number of items */
- tas_filter(taset,apps); /* and remove unnecessary items */
- tt = clock() -t; /* from the transactions */
- } /* note the filtering time */
- for (i = tacnt; --i >= 0;)/* traverse and count transactions */
- ist_count(istree, tas_tract(taset, i), tas_tsize(taset, i));
- tc = clock() -t; } /* note the new count time */
- else { /* if to work on the input file, */
- rewind(in); /* reset the file position */
- for (maxcnt = 0; (i = is_read(itemset, in)) == 0; ) {
- if (filter != 0) /* (re)read the transactions and */
- is_filter(itemset, apps); /* remove unnecessary items */
- k = is_tsize(itemset); /* update the maximum size */
- if (k > maxcnt) maxcnt = k; /* of a transaction */
- ist_count(istree, is_tract(itemset), k);
- } /* count the transaction in the tree */
- if (i < 0) error(i, fn_in, RECCNT(itemset), BUFFER(itemset));
- if (maxcnt < maxlen) /* update the maximal rule length */
- maxlen = maxcnt; /* according to the max. t.a. size */
- } /* (may be smaller than before) */
- }
- if (!taset && !tatree) { /* if transactions were not loaded */
- if (in != stdin) fclose(in);/* if not read from standard input, */
- in = NULL; /* close the input file */
- } /* clear the file variable */
- MSG(fprintf(stderr, " done [%.2fs].n", SEC_SINCE(t)));
- /* --- filter found item sets --- */
- if ((target == TT_MFSET) || (target == TT_CLSET)) {
- MSG(fprintf(stderr, "filtering %s item sets ... ",
- (target == TT_MFSET) ? "maximal" : "closed"));
- t = clock(); /* filter the item sets */
- ist_filter(istree, (target == TT_MFSET) ? IST_MAXFRQ : IST_CLOSED);
- MSG(fprintf(stderr, "done [%.2fs].n", SEC_SINCE(t)));
- } /* (filter takes longer than print) */
- /* --- sort transactions --- */
- if (target <= TT_CLSET) { /* if to find frequent item sets */
- if (!taset) /* transactions must be loaded */
- ext = 0; /* for extended support output */
- else if (ext) { /* if extended output is requested */
- MSG(fprintf(stderr, "sorting transactions ... "));
- t = clock(); /* start the timer */
- tas_sort(taset, heap); /* sort the transactions */
- MSG(fprintf(stderr, "done [%.2fs].n", SEC_SINCE(t)));
- } /* (sorting is necessary to find the */
- } /* number of identical transactions) */
- /* --- print item sets/rules/hyperedges --- */
- t = clock(); /* start the timer */
- if (fn_out && *fn_out) /* if an output file name is given, */
- out = fopen(fn_out, "w"); /* open the output file */
- else { /* if no output file name is given, */
- out = stdout; fn_out = "<stdout>"; } /* write to std. output */
- MSG(fprintf(stderr, "writing %s ... ", fn_out));
- if (!out) error(E_FOPEN, fn_out);
- ist_init(istree, minlen, arem, minval);
- set = is_tract(itemset); /* get the transaction buffer */
- if (target <= TT_CLSET) { /* if to find frequent item sets */
- for (n = 0; 1; ) { /* extract item sets from the tree */
- k = ist_set(istree, set, &supp, &conf);
- if (k <= 0) break; /* get the next frequent item set */
- if (supp > smax) continue;/* check against maximal support */
- for (i = 0; i < k; i++) { /* traverse the set's items */
- name = is_name(itemset, set[i]);
- if (c2scf) { sc_format(buf, name, 0); name = buf; }
- fputs(name, out); /* print the name of the next item */
- fputs((i < k-1) ? sep : " ", out);
- } /* print a separator */
- fputs(" (", out); /* print the item set's support */
- if (sout & 1) { fprintf(out, fmt, supp *100);
- fputs((sout & 2) ? "%/" : "%", out); }
- if (sout & 2) { fprintf(out, "%d", (int)(supp *tacnt +0.5)); }
- if (ext) { /* if to print the extended support */
- supp = tas_occur(taset, set, k);
- fputs(", ", out); /* get the number of occurrences */
- fprintf(out, fmt, (supp/tacnt) *100);
- if (sout & 2) fprintf(out, "/%.0f", supp);
- } /* print the extended support data */
- if (aval) { fputs(", ", out); fprintf(out, fmt, conf *100); }
- fputs(")n", out); /* print the add. eval. measure, */
- n++; /* terminate the support output, */
- } } /* and count the item set */
- else if (target == TT_RULE) { /* if to find association rules, */
- for (n = 0; 1; ) { /* extract rules from tree */
- k = ist_rule(istree, set, &supp, &conf, &lftval, &minval);
- if (k <= 0) break; /* get the next association rule */
- if (supp > smax) continue;/* check against maximal support */
- for (i = 0; i < k; i++) { /* traverse the rule's items */
- name = is_name(itemset, set[i]);
- if (c2scf) { sc_format(buf, name, 0); name = buf; }
- fputs(name, out); /* print the next item */
- fputs((i <= 0) ? " <- " : ((i < k-1) ? sep : " "), out);
- } /* print a separator */
- fputs(" (", out); /* print the rule evaluation */
- if (ext && (rsdef == IST_BODY)) {
- if (sout & 1) { fprintf(out, fmt, supp *conf *100);
- if (sout & 2) fputc('/', out); }
- if (sout & 2) { fprintf(out, "%d", (int)(supp*conf*tacnt+0.5));}
- fputs(", ", out); /* print the support of the rule */
- } /* from the support of the body */
- if (sout & 1) { fprintf(out, fmt, supp *100);
- if (sout & 2) fputc('/', out); }
- if (sout & 2) { fprintf(out, "%d", (int)(supp *tacnt +0.5)); }
- fputs(", ", out); /* print the rule support */
- if (ext && (rsdef == IST_BOTH)) {
- if (sout & 1) { fprintf(out, fmt, (supp/conf) *100);
- if (sout & 2) fputc('/', out); }
- if (sout & 2) { fprintf(out,"%d",(int)((supp/conf)*tacnt+0.5));}
- fputs(", ", out); /* print the support of the body */
- } /* from the support of the rule */
- fprintf(out, fmt, conf *100); /* print the rule confidence */
- if (lift) { fputs(", ", out); fprintf(out, fmt, lftval *100); }
- if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
- fputs(")n", out); /* print the value of the additional */
- n++; /* rule evaluation measure and */
- } } /* count the association rule */
- else { /* if to find association hyperedges */
- for (n = 0; 1; ) { /* extract hyperedges from tree */
- k = ist_hedge(istree, set, &supp, &conf);
- if (k <= 0) break; /* get the next hyperedge */
- if (supp > smax) continue;/* check against maximal support */
- for (i = 0; i < k; i++) { /* traverse the edge's items */
- name = is_name(itemset, set[i]);
- if (c2scf) { sc_format(buf, name, 0); name = buf; }
- fputs(name, out); /* print the name of the next item */
- fputs((i < k-1) ? sep : " ", out);
- } /* print a separator */
- fputs(" (", out);
- if (sout & 1) { fprintf(out, fmt, supp *100);
- if (sout & 2) fputc('/', out); }
- if (sout & 2) { fprintf(out, "%d", (int)(supp *tacnt +0.5)); }
- fputs(", ", out); fprintf(out, fmt, conf *100);
- fputs(")n", out); /* print support and confidence */
- n++; /* of the hyperedge and */
- } /* count the hyperedge */
- } /* if (target <= TT_MFSET) .. else .. */
- if (fflush(out) != 0) error(E_FWRITE, fn_out);
- if (out != stdout) fclose(out);
- out = NULL; /* close the output file */
- MSG(fprintf(stderr, "[%d %s(s)] done ", n, ttypes[target]));
- MSG(fprintf(stderr, "[%.2fs].n", SEC_SINCE(t)));
- #ifdef BENCH
- printf("number of counters : %dn", istree->cnt);
- printf("necessary counters : %dn", istree->nec);
- printf("number of child pointers: %dn", istree->chcnt);
- printf("necessary child pointers: %dn", istree->chnec);
- printf("allocated bytes : %dn", istree->bytes);
- #endif
- /* --- clean up --- */
- #ifndef NDEBUG /* if this is a debug version */
- free(apps); /* delete the item app. vector */
- ist_delete(istree); /* delete the item set tree, */
- if (tatree) tat_delete(tatree); /* the transaction tree, */
- if (taset) tas_delete(taset, 0); /* the transaction set, */
- is_delete(itemset); /* and the item set */
- #endif
- #ifdef STORAGE /* if storage debugging */
- showmem("at end of program"); /* check memory usage */
- #endif
- return 0; /* return 'ok' */
- } /* main() */