Apriori算法实验报告
Apriori算法实验报告
一、Apriori算法的说明
在Apriori算法中，寻找最大项目集（即频繁项目集）的基本思想是：算法需要对数据集进行多步处理。第一步，简单统计所有含一个元素的项目集出现的频率，并找出那些不小于最小支持度的项目集，即一维最大项目集。从第二步开始循环处理，直到再没有最大项目集生成。循环过程是：第k步中，根据第k-1步生成的(k-1)维最大项目集产生k维候选项目集，然后对数据库进行搜索，得到候选项目集的支持度，与最小支持度比较，从而找到k维最大项目集。
二、Apriori算法思想
1、主要思想就是发现频繁项目集,和生成关联规则。
程序的主要过程函数:
A.由Tid生成C1(单独生成)。
B.由Ck生成Lk。
结束之前调用打印函数print，打印出Lk，并判断是否结束；若未结束，则调用由Lk-1生成Ck的函数。（程序中仅当 sup.size() > 1 时才继续，即结束条件是 sup.size() <= 1。）
C.由Lk-1生成Ck。
结束之前调用打印函数print，打印出Ck，记录次数(times)加一，并调用由Ck生成Lk的函数。
2、源程序使用的数据结构
程序主要用的是C++的vector 和list模版。
●vector
●list
3、数据集
有一数据库D，其中有四个事务记录，分别表示为：
TID    Items
T1     I1,I3,I4
T2     I2,I3,I5
T3     I1,I2,I3,I5
T4     I2,I5
（注：下文源程序的 init() 中硬编码的是另一组共九条事务的数据，而不是此表中的四条。）
5.源程序
#include
#include
#include
using namespace std;
// Forward declarations. NOTE(review): the parameter lists are truncated in this
// copy of the listing (text after the template bracket is missing).
void print(list
void Lk_Ck(list
// Truncated declaration; presumably the global support-count vector `sup`
// that the functions below index per itemset — TODO confirm.
vector
// Minimum absolute support count; main() assigns tid_num * support to it.
int min_sup;
// Relative minimum support and minimum confidence thresholds, assigned in main().
double support, minconfidence;
// Current itemset size k: the item lists store `times` consecutive entries per
// itemset, and every traversal below advances in strides of `times`.
int times = 1;
// Truncated declaration; presumably the global `lk` vector filled in Ck_Lk and
// read by the rule-generation functions — TODO confirm.
vector
// Fills a[0]..a[8] with a hard-coded sample database of nine transactions
// over the items "I1".."I5".
// NOTE(review): the parameter list is truncated in this copy; `a` is indexed
// like an array of lists, so the caller must supply at least nine of them.
void init(list
{
a[0].push_back("I1");
a[0].push_back("I2");
a[0].push_back("I5");
a[1].push_back("I2");
a[1].push_back("I4");
a[2].push_back("I2");
a[2].push_back("I3");
a[3].push_back("I1");
a[3].push_back("I2");
a[3].push_back("I4");
a[4].push_back("I1");
a[4].push_back("I3");
a[5].push_back("I2");
a[5].push_back("I3");
a[6].push_back("I1");
a[6].push_back("I3");
a[7].push_back("I1");
a[7].push_back("I2");
a[7].push_back("I3");
a[7].push_back("I5");
a[8].push_back("I1");
a[8].push_back("I2");
a[8].push_back("I3");
}
// Builds the 1-item candidate list and its support counts, then prints them.
// NOTE(review): the parameter list is truncated in this copy; the body uses
// tid, tid_len, c1 and first_sup.
void first_c(list
{
// Truncated declaration; presumably the list iterators iter, iter_old,
// iter_tmp and iter_tmp_old used below — TODO confirm.
list
int i = 0;
// Step 1: append every item of every transaction tid[0..tid_len) to c1,
// duplicates included.
for(; i < tid_len; i++)
{
for(iter = tid[i].begin(); iter != tid[i].end(); iter++)
{
c1.push_back(*iter);
}
}
i = 0;
// Step 2: de-duplicate c1 in place. For the i-th distinct item, first_sup[i]
// starts at 1 and is incremented once for every later duplicate, which is
// then erased from c1.
for(iter = c1.begin(); iter != c1.end(); i++)
{
iter_old = iter;
first_sup.push_back(1);
for(iter_tmp = ++iter; iter_tmp != c1.end();)
{
// Advance before a possible erase so iter_tmp never refers to a removed node.
iter_tmp_old = iter_tmp++;
if(strcmp(*iter_old, *iter_tmp_old) == 0)
{
first_sup[i]++;
c1.erase(iter_tmp_old);
}
}
// `iter` may point at a node erased above, so the next position is
// recomputed from iter_old, which is never erased.
iter = ++iter_old;
}
printf("C%d\tsup\n", times);
// print() reads the global `sup`; presumably first_sup refers to that same
// vector (main passes `sup`) — TODO confirm against the full signature.
print(c1);
}
// Produces the k-frequent itemsets from the k-candidate itemsets.
// Snapshots c into the global lk, prunes c and sup, copies the survivors into l
// and, while more than one itemset remains, prints L_k and calls Lk_Ck to build
// the next candidate set (Lk_Ck calls back into this function).
// NOTE(review): the parameter list is truncated in this copy; the body uses
// c, l, tid and tid_num.
void Ck_Lk(list
{
// Truncated declaration; presumably the list iterators iter and iter_old — TODO confirm.
list
// Keep a copy of the candidate list as received (before pruning) in the global lk.
lk.clear();
for(iter = c.begin(); iter != c.end(); iter++)
{
lk.push_back(*iter);
}
l.clear();
int i, j, k;
// Pass 1: walk sup and c in lockstep (one sup entry per `times` items of c)
// and drop every itemset whose support is below min_sup.
for(iter = c.begin(), i = 0; i < (int)sup.size();)
{
if(sup[i] < min_sup)
{
// Erase the `times` items of this itemset; iter is advanced first so it
// stays valid after the list erase.
for(j = 0; j < times; j++)
{
iter_old = iter++;
c.erase(iter_old);
}
// Remove entry i from sup by shifting the tail down one slot and then
// dropping the last element.
if(++i != sup.size())
{
for(k = i-1; k < (int)sup.size() - 1; k++)
sup[k] = sup[k+1];
}
// NOTE(review): assigning sup.end() to int* only compiles where
// vector<int>::iterator is a raw pointer — confirm the target toolchain.
int *p=sup.end();
sup.erase(--p);
// Undo the ++i above so the element shifted into this slot is examined next.
i = i-1;
}
else
{
// Itemset is kept: skip over its `times` items.
for(j = 0; j < times; j++)
{
iter++;
}
i++;
}
}
// Truncated declaration; presumably the vector iterators iter_vect and
// iter_vect_old — TODO confirm.
vector
// Pass 2: same pruning again, but iterator-based and against the hard-coded
// threshold 2 rather than min_sup.
for(iter = c.begin(), iter_vect = sup.begin(); iter_vect != sup.end(); )
{
if(*iter_vect < 2)
{
for(j = 0; j < times; j++)
{
iter_old = iter++; // advance first so iter survives the list erase
c.erase(iter_old);
}
// Original author's question: do vector and list differ when erasing an
// element? After erasing iter_vect_old, how does iter_vect end up back at
// iter_vect_old's position?
// NOTE(review): vector::erase invalidates iterators at and after the erased
// position, so iter_vect is invalid after the erase below; the iterator
// returned by erase() is the valid continuation.
iter_vect_old = iter_vect++;
sup.erase(iter_vect_old);
}
else
{
for(j = 0; j < times; j++)
{
iter++;
}
iter_vect++;
}
}
// Copy the surviving itemsets into l.
for(iter = c.begin(); iter != c.end(); iter++)
{
l.push_back(*iter);
}
// Continue only while more than one frequent itemset remains.
if(sup.size() > 1)
{
printf("L%d\n", times);
print(l);
Lk_Ck(c, l, tid, tid_num);
}
}
// Performs times - 1 comparisons: returns 1 when the first (times - 1) items
// reached through iter_pre and iter_cur are pairwise equal under strcmp,
// otherwise 0.
// NOTE(review): the parameter list is truncated in this copy; the body uses
// iter_pre and iter_cur.
int my_compare(list
{
int i = times - 1;
while(i--)
{
if(strcmp(*(iter_pre++), *(iter_cur++)) != 0)
return 0;
}
return 1;
}
// Returns true when each of the `times` items starting at iter is found in t.
// NOTE(review): the parameter list is truncated in this copy; the body uses
// t and iter.
// NOTE(review): the braces below do not balance as printed — the inner for
// loop appears to have lost its opening brace in this copy.
bool mycompare(list
{
int i = times, j = 0;
// Truncated declaration; presumably the list iterator iter_tid — TODO confirm.
list
while(i--)
{
// Linear scan of t for the current item; j counts the items that were found.
for(iter_tid = t.begin(); iter_tid != t.end(); iter_tid++)
if(strcmp(*iter_tid , *iter) == 0)
{
j++;
break;
}
}
iter++;
}
if(j == times)
{
return true;
}
else
return false;
}
// Produces the k-candidate itemsets from the (k-1)-frequent itemsets.
// Joins pairs of itemsets in l that share their first (times - 1) items,
// increments times, recounts the support of every new candidate over the
// transactions, prints C_k and hands over to Ck_Lk.
// NOTE(review): the parameter list is truncated in this copy; the body uses
// c, l, tid and tid_num.
void Lk_Ck(list
{
c.clear();
int i, j;
// Truncated declaration; presumably the list iterators iter_pre and iter_cur — TODO confirm.
list
// Outer loop: iter_pre visits each itemset of l (stride `times`).
for(iter_pre = l.begin(); iter_pre != l.end();)
{
// Position iter_cur on the itemset that follows iter_pre.
i = times;
iter_cur = iter_pre;
while(i--)
{
if(iter_cur == l.end())
break;
iter_cur++;
}
if(iter_cur == l.end())
break;
// Inner loop: iter_cur visits every itemset after iter_pre.
// NOTE(review): the loop's opening brace is missing in this copy, and the
// next line is a truncated declaration, presumably of tmp_iter_pre and
// tmp_iter_cur — TODO confirm.
for(iter_cur; iter_cur != l.end();)
list
if(my_compare(tmp_iter_pre, tmp_iter_cur) == 1)
{
tmp_iter_pre = iter_pre, tmp_iter_cur = iter_cur;
// Emit the new candidate: the `times` items of the first itemset followed
// by the last item of the second itemset.
for(j = 0; j <= times; j++)
{
if(j == times)
{
i = times -1;
while(i--)
{
tmp_iter_cur++;
}
c.push_back(*tmp_iter_cur);
}
else
{
c.push_back(*(tmp_iter_pre++));
}
}
}
// Advance iter_cur to the next itemset.
i = times;
while(i--)
{
iter_cur++;
}
}
// Advance iter_pre to the next itemset.
i = times;
while(i--)
{
iter_pre++;
}
}
// Candidates now hold one more item each.
times++;
sup.clear();
// Truncated declaration; presumably the list iterator iter — TODO confirm.
list
int len = 0;
// Count the candidates by stepping through c in strides of `times`.
for(iter = c.begin(); iter != c.end(); len++)
{
i = times;
while(i--)
{
iter++;
}
}
sup.assign(len, 0); // len is the number of candidate itemsets generated.
for(i = 0; i < tid_num; i++) // To be improved: replace with the entered count once user-defined input is supported.
{
j = 0;
// sup[j] counts the transactions that contain every item of candidate j.
for(iter = c.begin(); iter != c.end(); j++)
{
if(mycompare(tid[i], iter) == true)
{
sup[j]++;
}
int k = times;
while(k--)
{
iter++;
}
}
}
printf("C%d\n", times);
print(c);
Ck_Lk(c, l, tid, tid_num);
}
// Returns 1 when the number of items of t that match some entry of the global
// lk equals `times`, otherwise 0. The caller sums the result into count_fenzi
// ("fenzi" is pinyin for numerator).
// NOTE(review): the parameter list is truncated in this copy; the body uses t.
int Apriori_compare_fenzi(list
{
int j = 0, count = 0;
// Truncated declaration; presumably the list iterator iter — TODO confirm.
list
for(iter = t.begin(); iter != t.end(); iter++)
{
// Count this item at most once: stop at its first match in lk.
for(j = 0; j < (int)lk.size(); j++)
{
if(strcmp(lk[j], *iter) == 0)
{
count++;
break;
}
}
}
if(count == times)
return 1;
else
return 0;
}
// Returns 1 when the items of t produce exactly `len` strcmp matches against
// the entries l[begin_flag] .. l[begin_flag + len - 1], otherwise 0. The caller
// sums the result into count_fenmu ("fenmu" is pinyin for denominator).
// NOTE(review): the parameter list is truncated in this copy; the body uses
// t, l, begin_flag and len.
int Apriori_compare(list
{
int i = 0, j = 0, count = 0;
// Truncated declaration; presumably the list iterator iter — TODO confirm.
list
for(iter = t.begin(); iter != t.end(); iter++)
{
j = len;
i = begin_flag;
// NOTE(review): the do/while runs at least once and tests --j, so len must
// be >= 1; with len == 0 the counter goes negative and the loop keeps
// reading past the intended range.
do{
if(strcmp(*iter, l[i]) == 0)
count++;
i++;
}while(--j);
}
if(count == len)
return 1;
else
return 0;
}
// Prints one rule row: the times + 1 tokens l[m] .. l[m + times], the ratio
// fenzi / fenmu, the global support, and a yes/no flag telling whether that
// ratio exceeds minconfidence.
// NOTE(review): the parameter list and the function's opening brace are
// missing in this copy; the body uses l, m, fenzi and fenmu.
void print_Apriori(vector
int i = 0, j = 1;
// Labels printed in the last column: buf[0] means "yes", buf[1] means "no".
char *buf[] = {"是","否"};
// The rule is flagged "yes" only when the ratio exceeds minconfidence by more
// than 1e-6.
// NOTE(review): fenmu is not checked for zero before the division.
if(((fenzi/fenmu) - minconfidence) > 0.000001)
j = 0;
for(; i < times + 1; i++)
{
printf("%s ", l[m + i]);
}
printf("\t%0.2f\t%0.2f\t%s", fenzi/fenmu, support, buf[j]);
printf("\n");
}
// Builds the rule token list from the global lk and prints one row per rule.
// Each rule occupies times + 1 consecutive tokens of l, including the "=>"
// separator.
// NOTE(review): the parameter list is truncated in this copy; the body uses
// tid and tid_num.
void LK_Apriori(list
{
// Truncated declaration; presumably the local token vector l — TODO confirm.
vector
int i = 0, j = 0, z = 0, k = 2, m = 0;
double count_fenzi = 0, count_fenmu = 0;
l.clear();
// First group of rules, "1 => (times - 1)": lk[i], then "=>", then every
// other entry of lk.
// NOTE(review): the loop's opening brace appears inside the trailing comment
// on the next line in this copy.
for(; i < (int)lk.size(); i++) // 1 => (times - 1) {
l.push_back(lk[i]);
l.push_back("=>");
for(j = 0; j < (int)lk.size(); j++)
{
if( j != i)
l.push_back(lk[j]);
}
}
// j records where the first group ends. Second group, "(times - 1) => 1":
// re-emit the right-hand tokens of each first-group rule (k starts at 2, just
// past the "=>"), then "=>" and that rule's left-hand token l[m].
j = (int)l.size(); // (times -1) => 1
for(i = 0; i < times; i++)
{
z = times;
while(--z)
{
l.push_back(l[k]);
k++;
}
k += 2;
l.push_back("=>");
l.push_back(l[m]);
m += (times +1);
}
printf("关联规则\t可信度\t支持度\t强规则?!\n");
m = 0;
// Print 2 * times rows; m is the index of the row's first token in l.
// NOTE(review): count_fenzi and count_fenmu are initialised once above and
// never reset, so each row's ratio includes the counts of all earlier rows.
for(z = 0; z < 2 * times; z++)
{
// Rows that start before index j belong to the first group, whose
// left-hand side is a single token.
if((z * (times + 1))< j)
{
for(i = 0; i < tid_num; i++)
{
count_fenmu += Apriori_compare(tid[i], l, m, 1);
count_fenzi += Apriori_compare_fenzi(tid[i]);
}
print_Apriori(l, m, count_fenzi, count_fenmu);
}
else
{
// Second group: the left-hand side spans times - 1 tokens.
for(i = 0; i < tid_num; i++)
{
count_fenmu += Apriori_compare(tid[i], l, m, times - 1);
count_fenzi += Apriori_compare_fenzi(tid[i]);
}
print_Apriori(l, m, count_fenzi, count_fenmu);
}
m += (times + 1);
}
}
// Prints ItemSet one itemset per row: `times` tab-separated items followed by
// the matching entry of the global sup, then a blank line.
// NOTE(review): the parameter list is truncated in this copy; the body uses
// ItemSet.
void print(list
{
// Truncated declaration; presumably the list iterator iter — TODO confirm.
list
int i, j = 0;
// j is the row index into sup; the inner loop consumes the row's `times` items.
for(iter = ItemSet.begin(); iter != ItemSet.end(); j++)
{
i = times;
while(i--)
{
printf("%s\t", *iter);
iter++;
}
printf("%d\n", sup[j]);
}
printf("\n");
}
// Entry point: loads the sample transactions, sets the thresholds, builds C1,
// runs the candidate/frequent itemset loop and prints the association rules.
int main()
{
// Truncated declaration; presumably the transaction lists tid and the working
// lists c and l — TODO confirm.
list
init(tid);
int tid_num = 9;
support = 0.5;
minconfidence = 0.33;
// The double product is truncated on assignment to the int min_sup:
// 9 * 0.5 = 4.5 is stored as 4.
min_sup = tid_num * support;
first_c(tid, c, sup, tid_num);
Ck_Lk(c, l, tid, tid_num); // Iteratively generates the k-candidate sets.
LK_Apriori(tid, tid_num); // Generates association rules from the candidate set.
return 0; }
6.运行结果: