GOBNILP
f164d83
|
Generates local scores from discrete data using AD trees. More...
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <math.h>
#include <limits.h>
#include <string.h>
#include "scoring.h"
#include "utils.h"
#include "versiongit.h"
Data Structures | |
union | adcontab |
A contingency table. More... | |
struct | adtree |
represents data for a 'query', e.g. X1=1,X3=2 More... | |
struct | scored_parentset |
Parent set with its score Note that the child variable is not represented. More... | |
struct | subset_tree |
stores a set of downward closed subsets of the variables in a tree More... | |
struct | varynode |
for splitting data on values of a particular variable (variable not stored in varynode) More... | |
Macros | |
#define | BLOCKSIZE 10000 |
#define | MAXARITY UCHAR_MAX |
#define | min(A, B) ((A) < (B) ? (A) : (B)) |
Typedefs | |
typedef union adcontab | ADCONTAB |
A contingency table. | |
typedef struct adtree | ADTREE |
An AD tree. | |
typedef unsigned char | ARITY |
Arity of a variable in the data. | |
typedef unsigned int | COUNT |
Count, typically of datapoints. | |
typedef unsigned int | ROW |
Index of a row (i.e. datapoint) in the data. | |
typedef double | SCORE |
A local score. More... | |
typedef double | SCOREARG |
An argument for a local score function. More... | |
typedef struct scored_parentset | SCORED_PARENTSET |
parent set with its local score | |
typedef struct subset_tree | SUBSET_TREE |
subset tree | |
typedef unsigned char | VALUE |
Value of a variable in the data. | |
typedef unsigned int | VARIABLE |
Variable in the data. | |
typedef struct varynode | VARYNODE |
A varynode (in an AD tree) | |
Functions | |
static void | add_subset (SUBSET_TREE *tree, SUBSET_TREE *bottom_ptr, VARIABLE *vars, int num_vars, VARIABLE offset, const VARIABLE nvars) |
Add a subset to the tree (if not already there ) More... | |
static SCIP_RETCODE | addGeneralDAGConstraints (SCIP *scip, ParentSetData *psd, int **is_parent) |
Reads in structural constraints to restrict which parent sets are considered. More... | |
static SCIP_RETCODE | addParentSets (SCIP *scip, VARIABLE child, SCORED_PARENTSET **parent_sets, int num_parent_sets, ParentSetData *psd, SCIP_Real ***scores) |
Record local scores for parent sets of a particular child. More... | |
static void | build_adtree (ADTREE *adtree, const VARIABLE variable, ROW *theserows, COUNT count, const int rmin, const int depth, int *n_nodes, const int adtreedepthlim, const int adtreenodeslim, const VARIABLE nvars, VALUE **data, ARITY *arity) |
Build an AD tree from (a subset of) the data. More... | |
static void | build_varynode (VARYNODE *varynode, VARIABLE variable, ROW *theserows, COUNT count, const int rmin, int depth, int *n_nodes, const int adtreedepthlim, const int adtreenodeslim, VARIABLE nvars, VALUE **data, ARITY *arity) |
Build an vary node from (a subset of) the data. More... | |
static SCIP_Bool | check_subset (SUBSET_TREE *tree, SUBSET_TREE *bottom_ptr, VARIABLE *vars, int num_vars, VARIABLE offset) |
Check whether a particular subset of variables is stored in a tree of subsets. More... | |
static SCIP_RETCODE | ci_constraint (SCIP *scip, ParentSetData *psd, const char *a_str, const char *b_str, const char *s_str, int **is_parent) |
Use a conditional independence (ci) constraint to rule out parents. More... | |
static void | delete_adtree (ADTREE *adtree, const VARIABLE variable, VARIABLE nvars, ARITY *arity) |
Delete an AD tree. More... | |
static void | delete_contab (ADCONTAB adcontab, VARIABLE *variables, int nvariables, const ARITY *arity) |
Delete a contingency table. More... | |
static void | delete_tree (SUBSET_TREE *tree, SUBSET_TREE *bottom_ptr, VARIABLE length) |
Delete a tree of subsets. More... | |
static SCORE | log_likelihood (const ADCONTAB *adcontab, VARIABLE *variables, int nvariables, const SCOREARG aijk, const SCORE lgaijk, COUNT *npos_cells_ptr, const ARITY *arity, SCIP_Bool fast) |
Compute the (marginal) log-likelihood for a subset of the variables (modulo a constant) More... | |
static SCORE | log_likelihood_cache (SCIP *scip, const ADTREE *adtree, VARIABLE *variables, int nvariables, COUNT *npos_cells_ptr, const int nvarscachelimit, const int cachesizelimit, const int cacheblocksize, int *cachesize_ptr, SCORE **llh_cache_ptr, COUNT **pos_cells_cache_ptr, const double alpha, const ARITY *arity, SCIP_Bool fast, VALUE **data, int *nsubsets) |
Compute the (marginal) log-likelihood for a subset of the variables and store in cache, or retrieve from cache if already computed. More... | |
static int | lookup (char **names, ARITY length, ARITY *used, char *name) |
Find the integer encoding for (the string representation of) a value of a variable. More... | |
static void | makecontab (const ADTREE *adtree, VARIABLE offset, VARIABLE *variables, int nvariables, ADCONTAB *adcontab, VALUE **data, const ARITY *arity) |
Construct a contingency table from an adtree. More... | |
static void | makecontableaf (const ROW *leaflist, const COUNT count, VARIABLE *variables, int nvariables, ADCONTAB *adcontab, VALUE **data, const ARITY *arity) |
Construct a contingency table from a leaflist ( contingency table must be for at least one variable ) More... | |
static SCIP_RETCODE | parseSet (SCIP *scip, ParentSetData *psd, const char *str, int *set, int *n_set, SCIP_Bool *success) |
Parses a string to extract a set from it. More... | |
static SCIP_RETCODE | process_constraint (SCIP *scip, ParentSetData *psd, const char *line, int **is_parent) |
Processes a constraint on the DAG structure. More... | |
static int | rank_subset (VARIABLE *variables, int nvariables, int *nsubsets) |
Map a subset of (BN) variables to a unique index. More... | |
static SCIP_RETCODE | readProblemFromFile (SCIP *scip, const char *filename, int num_delims, char *delims, SCIP_Bool merge_delims, char ***nodeNames, ARITY **arities, VALUE ***items, int *num_vars, int *num_rows, char ****labels, int **num_observed) |
Read discrete data from a file. More... | |
SCIP_RETCODE | SC_addScoringParameters (SCIP *scip) |
Add scoring parameters. More... | |
SCIP_RETCODE | SC_readProblemInDataFormat (SCIP *scip, const char *filename, int num_delims, char *delims, SCIP_Bool merge_delims, ParentSetData *psd, SCIP_Real ***scores, PropertyData *prop) |
Read data in from a file, generate and store local scores. More... | |
static int | sps_sort (const void *p1, const void *p2) |
Compare two scored parent sets according to score. More... | |
static void | subtract_contab (ADCONTAB *adcontab1, const ADCONTAB *adcontab2, VARIABLE *variables, int nvariables, const ARITY *arity) |
Subtract one contingency table from another. More... | |
Generates local scores from discrete data using AD trees.
typedef double SCORE |
A local score.
Currently only BDeu implemented
typedef double SCOREARG |
An argument for a local score function.
|
static |
Add a subset to the tree (if not already there )
tree | Set of subsets of variables |
bottom_ptr | Indicates non-null leaf nodes |
vars | The subset of variables to add |
num_vars | The number of variables in the subset |
offset | Offset for this node |
nvars | Number of variables in the data |
References build_varynode(), adtree::count, and scored_parentset::nvars.
|
static |
Reads in structural constraints to restrict which parent sets are considered.
(Altered version of function of same name in probdata_bn.c)
scip | The SCIP instance to add the constraint to. |
psd | The parent set data relating to this problem, only nodeNames is set. |
is_parent | is_parent[i][j] = 1 (-1) if j must (not) be a parent of i |
References process_constraint().
|
static |
Record local scores for parent sets of a particular child.
scip | SCIP data structure |
child | Child variable whose scored parent sets are being added |
parent_sets | Scored parent sets for child |
num_parent_sets | The number of scored parent sets for child |
psd | Parent set data to populate |
scores | (*scores)[child][j] will be the score for the jth parent set of child |
References ParentSetData::nParents, ParentSetData::nParentSets, scored_parentset::nvars, ParentSetData::ParentSets, scored_parentset::score, and scored_parentset::vars.
|
static |
Build an AD tree from (a subset of) the data.
adtree | pointer to ADTREE being built |
variable | first variable to specialise further on, if variable=nvars then there is none |
theserows | datapoint indices for this tree |
count | number of datapoints for this tree |
rmin | if count below this then create a leaflist |
depth | the depth of this node |
n_nodes | (pointer to) the number of nodes in the ADTREE |
adtreedepthlim | limit on the depth of the ADTREE |
adtreenodeslim | limit on the number of nodes in the ADTREE |
nvars | Number of variables in the data |
data | data[i][j] is the value of variable i in row j |
arity | arity[i] is the arity of variable i, |
References build_varynode(), adtree::children, adtree::count, adtree::leaflist, and scored_parentset::nvars.
Referenced by build_varynode().
|
static |
Build an vary node from (a subset of) the data.
varynode | varynode being built |
variable | which variable is being split |
theserows | datapoint indices to divide between values of variable |
count | number of datapoints for this tree |
depth | the depth of this node |
n_nodes | (pointer to) the number of nodes in the ADTREE |
adtreedepthlim | limit on the depth of the ADTREE |
adtreenodeslim | limit on the number of nodes in the ADTREE |
nvars | Number of variables in the data |
data | data[i][j] is the value of variable i in row j |
arity | arity[i] is the arity of variable i, |
References build_adtree(), varynode::children, adtree::count, and varynode::mcv.
Referenced by add_subset(), and build_adtree().
|
static |
Check whether a particular subset of variables is stored in a tree of subsets.
tree | Set of subsets of variables |
bottom_ptr | Indicates non-null leaf nodes |
vars | The subset of variables to look for |
num_vars | The number of variables in the subset |
offset | Offset for this node |
|
static |
Use a conditional independence (ci) constraint to rule out parents.
and are given as comma separated variable names
scip | The SCIP instance in which to add the constraint. |
psd | The parent set data for the problem. |
a_str | A set |
b_str | B set |
s_str | S (separator) set. ( not used ) |
is_parent | is_parent[i][j] is set to 1 (-1) if j can(not) be a parent of i |
References parseSet().
Referenced by process_constraint().
|
static |
Delete an AD tree.
adtree | pointer to ADTREE being deleted |
variable | first variable to specialise further on, if variable=nvars then there is none |
nvars | Number of variables in the data |
arity | arity[i] is the arity of variable i, |
References adtree::children, varynode::children, adtree::leaflist, and scored_parentset::nvars.
|
static |
Delete a contingency table.
The contingency table must have at least one variable (i.e. it cannot be a count), but this only checked for in DEBUG mode.
adcontab | Contingency table to delete |
variables | Variables in the contingency table |
nvariables | Number of variables in the contingency table (must be positive) |
arity | arity[i] is the arity of variable i, |
References adcontab::children.
Referenced by log_likelihood_cache().
|
static |
Delete a tree of subsets.
tree | Tree of subsets to delete |
bottom_ptr | Indicates non-null leaf nodes |
length | Number of children of this node |
|
static |
Compute the (marginal) log-likelihood for a subset of the variables (modulo a constant)
To get the true log-likelihood one would need to add lgamma(alpha) - log(alpha+N) to the result of this function, where N is the number of samples. Since local scores are always computed as this_function(family)-this_function(parents), there is no need to compute the constant term
adcontab | contingency table for variables |
variables | the subset of the variables |
nvariables | the size of the subset of the variables |
aijk | effective samples size (ESS) divided by the number of joint instantiations of the subset of variables |
lgaijk | log(Gamma(aijk)) |
npos_cells_ptr | pointer to number of cells in adcontab with a positive count |
arity | arity[i] is the arity of variable i, |
fast | If true then C's lgamma function is used for computing scores, otherwise a slower, more accurate sum of logs |
References adcontab::children, and adcontab::count.
Referenced by log_likelihood_cache().
|
static |
Compute the (marginal) log-likelihood for a subset of the variables and store in cache, or retrieve from cache if already computed.
scip | SCIP instance |
adtree | the data |
variables | the subset of the variables |
nvariables | the size of the subset of the variables |
npos_cells_ptr | pointer to number of cells in adcontab with a positive count |
nvarscachelimit | subsets must have size below this to be cached. |
cachesizelimit | the maximum number of log-likelihoods and pos_cell_cache values to cache (limit is common to both). |
cacheblocksize | how much to increase the size of the cache for log-likelihoods and pos_cell_cache values when it is too small (common to both). |
cachesize_ptr | (pointer to) the current size of the cache for log-likelihoods and pos_cell_cache values (common to both) |
llh_cache_ptr | (pointer to) llh_cache[r] is the log-likelihood for (the data projected onto ) the unique subset of variables with rank r |
pos_cells_cache_ptr | (pointer to) pos_cell_cache[r] is the number of non-zero counts in the contingency table for the unique subset of variables with rank r. A value of zero indicates that neither the correct count nor the the associated llh_cache[r] value has yet been computed |
alpha | The 'effective sample size' governing the BDeu score |
arity | arity[i] is the arity of variable i, |
fast | If true then C's lgamma function is used for computing scores, otherwise a slower, more accurate sum of logs |
data | data[i][j] is the value of variable i in row j |
nsubsets | nsubsets[nvariables] is how many susbsets have size strictly less than nvariables |
References delete_contab(), log_likelihood(), makecontab(), and rank_subset().
Find the integer encoding for (the string representation of) a value of a variable.
If the input string name
does not occurs in names
, then it will be added to it (as long as doing so would not exceed the arity), and (*used)
will be incremented.
length
. names | names [r] is (the string representation of) the rth value of the variable |
length | arity of the variable (number of values the variable has) |
used | the number of strings stored in names |
name | (string representation of) a value of a variable |
References BgeMatrixCreate(), and UT_readFileAndSplit().
|
static |
Construct a contingency table from an adtree.
adtree | (Pointer to) the ADTREE or NULL |
offset | Offset for first variable (to identify correct vary nodes) |
variables | Variables in the sought contingency table (sorted) |
nvariables | Number of variables in the contingency table |
adcontab | Returned contingency table |
data | data[i][j] is the value of variable i in row j |
arity | arity[i] is the arity of variable i, |
References adtree::children, varynode::children, adcontab::children, adtree::count, adcontab::count, adtree::leaflist, makecontableaf(), varynode::mcv, and subtract_contab().
Referenced by log_likelihood_cache().
|
static |
Construct a contingency table from a leaflist ( contingency table must be for at least one variable )
leaflist | datapoints for this query |
count | number of datapoints (in leaflist) |
variables | variables in the contingency table (sorted) |
nvariables | number of variables in the contigency table |
adcontab | (pointer to ) returned contingency table |
data | data[i][j] is the value of variable i in row j |
arity | arity[i] is the arity of variable i, |
References adtree::children, adcontab::children, adtree::count, and adcontab::count.
Referenced by makecontab().
|
static |
Parses a string to extract a set from it.
Copied from probdata_bn
scip | The SCIP instance that this belongs to. |
psd | The data set that the set refers to. |
str | string to parse |
set | result set |
n_set | length of set |
success | success flag |
References get_index().
Referenced by ci_constraint().
|
static |
Processes a constraint on the DAG structure.
scip | The SCIP instance in which to add the constraint. |
psd | The parent set data relating to this problem, only nodeNames is set. |
line | The description of the constraint to add. |
is_parent | is_parent[i][j] is set to 1 (-1) if j can(not) be a parent of i |
References ci_constraint(), get_index(), and ParentSetData::n.
Referenced by addGeneralDAGConstraints().
|
static |
Map a subset of (BN) variables to a unique index.
If A is a subset of B then rank(A) < rank(B)
variables | the subset of the variables |
nvariables | the size of the subset of the variables |
nsubsets | nsubsets[nvariables] is how many susbsets have size strictly less than nvariables |
Referenced by log_likelihood_cache().
|
static |
Read discrete data from a file.
The last 7 arguments are outputs for which we use pointers. This function allocates space for these outputs.
scip | SCIP data structure |
filename | File containing the data |
num_delims | The number of field delimiters to use |
delims | The field delimiters to use |
merge_delims | Whether multiple field delimiters should be merged in to one |
nodeNames | (Pointer to) node (i.e. variable) names |
arities | (Pointer to) node variable arities |
items | (Pointer to) the data. &c (*items)[i][j] is the integer-encoded value of the ith variable in the jth datapoint |
num_vars | (Pointer to) the number of variables |
num_rows | (Pointer to) the number of rows (i.e. datapoints) in the data |
labels | (Pointer to) a mapping from the integer encoding of a value to the corresponding string (*labels)[i][r] is the string corresponding to the rth value of the ith variable |
num_observed | (Pointer to) the number of distinct values observed in the data for each variable |
References UT_readFileAndSplit().
SCIP_RETCODE SC_addScoringParameters | ( | SCIP * | scip | ) |
Add scoring parameters.
scip | SCIP data structure |
References UT_addBoolParam(), UT_addIntParam(), and UT_addRealParam().
Referenced by BN_addParameters().
SCIP_RETCODE SC_readProblemInDataFormat | ( | SCIP * | scip, |
const char * | filename, | ||
int | num_delims, | ||
char * | delims, | ||
SCIP_Bool | merge_delims, | ||
ParentSetData * | psd, | ||
SCIP_Real *** | scores, | ||
PropertyData * | prop | ||
) |
Read data in from a file, generate and store local scores.
This function sets the global variables alpha, palim, fast, cachesizelimit, cacheblocksize, nvarscachelimit, nvars, arity, nrows and data.
< The 'effective sample size' governing the BDeu score
< Maximum allowed size for a parent set
< If true then C's lgamma function is used for computing scores, otherwise a slower, more accurate sum of logs
< the maximum number of log-likelihoods and pos_cell_cache values to cache (limit is common to both).
< how much to increase the size of the cache for log-likelihoods and pos_cell_cache values when it is too small (common to both).
< subsets must have size below this to be cached.
< arity[i] is the arity of variable i, (discrete)
< data[i][j] is the value of variable i in row j (discrete)
< Number of variables in the data
< Number of rows (i.e. datapoints) in the data
< the current size of the cache for log-likelihoods and pos_cell_cache values (common to both)
< llh_cache[r] is the log-likelihood for (the data projected onto ) the unique subset of variables with rank r
< pos_cell_cache[r] is the number of non-zero counts in the contingency table for the unique subset of variables with rank r.
< nsubsets[nvariables] is how many susbsets have size strictly less than nvariables
< Subset tree used to indicate leaf nodes. Its value is never set or used; it only exists for bottom_ptr
to have something to point to.
< Pointer to bottom.
scip | SCIP data structure |
filename | File containing the data |
num_delims | The number of field delimiters to use |
delims | The field delimiters to use |
merge_delims | Whether multiple field delimiters should be merged in to one |
psd | Parent set data to populate |
scores | (*scores)[child][j] will be the score for the jth parent set of child |
prop | Property data structure |
References scored_parentset::nvars.
Referenced by readProblemInNonCIPFormat().
|
static |
Compare two scored parent sets according to score.
For use with qsort, so arguments have type void*
p1 | First scored parent set |
p2 | Second scored parent set |
|
static |
Subtract one contingency table from another.
The two contingency tables must be over the same variables
adcontab1 | Contingency table from which adcontab2 will be subtracted |
adcontab2 | Contingency table which will be subtracted from adcontab1 |
variables | Variables common to both adcontab1 and adcontab2 |
nvariables | Number of variables |
arity | arity[i] is the arity of variable i, |
References adcontab::children, adtree::count, and adcontab::count.
Referenced by makecontab().