-- 作者:admin
-- 发布时间:2008/4/27 7:30:38
-- 结合SQL Server进行的搜狗部分公开数据的实验方法
1、用户查询日志
1、准备代码 CREATE DATABASE [UserQuery] ON (NAME = N\'UserQuery_Data\', FILENAME = N\'C:\\Program Files\\Microsoft SQL Server\\MSSQL\\data\\UserQuery_Data.MDF\' , SIZE = 1, FILEGROWTH = 10%) LOG ON (NAME = N\'UserQuery_Log\', FILENAME = N\'C:\\Program Files\\Microsoft SQL Server\\MSSQL\\data\\UserQuery_Log.LDF\' , SIZE = 1, FILEGROWTH = 10%) COLLATE Chinese_PRC_CI_AS GO
CREATE TABLE [LogFile] ( [用户] [varchar] (50) COLLATE Chinese_PRC_CI_AS NULL , [查询词] [varchar] (50) COLLATE Chinese_PRC_CI_AS NULL , [数值] [varchar] (50) COLLATE Chinese_PRC_CI_AS NULL , [点击URL] [varchar] (500) COLLATE Chinese_PRC_CI_AS NULL , ) GO
CREATE TABLE [LogInfo] ( [id] [int] IDENTITY (1, 1) NOT NULL , [用户] [varchar] (50) COLLATE Chinese_PRC_CI_AS NULL , [查询词] [varchar] (50) COLLATE Chinese_PRC_CI_AS NULL , [URL结果排名] [int] NULL , [点击序号] [int] NULL , [点击URL] [varchar] (500) COLLATE Chinese_PRC_CI_AS NULL , CONSTRAINT [PK_TABLE1] PRIMARY KEY CLUSTERED ( [id] ) ON [PRIMARY] ) ON [PRIMARY] GO
[LogFile] 生成 [LogInfo] insert into loginfo select [用户],substring([查询词],2,len([查询词])-2),CONVERT(int, substring([数值],1,charindex(\' \',[数值])-1)),CONVERT(int, substring([数值],charindex(\' \',[数值])+1,10)),[点击URL] from logfile
可以加上各个字段的索引
2、分析 1)查看用户查询情况 select top 100 * from loginfo order by [用户],[点击序号]
2)查看用户会话情况 select top 100 用户,count(*) from loginfo group by 用户 order by count(*) desc
select count(distinct 用户) from loginfo
select * from loginfo where 用户=\'13505142608748166\'
select * from loginfo where 用户=\'13505142608748166\' order by 查询词,点击序号
select 用户 from loginfo group by 用户 having max(点击序号)=count(*) order by count(*)
select 用户 from loginfo group by 用户 having max(点击序号)=count(*) and count(*)>100 order by count(*)
3)查询词语情况 select top 100 查询词,count(*) from loginfo group by 查询词 order by count(*) desc select count(distinct 查询词) from loginfo select * from loginfo where 查询词=\'张玉凤\' order by 用户
select avg(len(查询词)) from loginfo select len(查询词),count(*) from loginfo group by len(查询词) order by count(*) desc
select top 100 查询词,count(distinct 用户) from loginfo group by 查询词 order by count(distinct 用户) desc
4)查询浏览结果的情况 select URL结果排名,count(*) from loginfo group by URL结果排名 order by count(*) desc select 点击序号,count(*) from loginfo group by 点击序号 order by count(*) desc
select 点击序号,count(*) as \'count\' into tempt from loginfo group by 点击序号 order by count(*) desc select round(count,-4),count(点击序号) from tempt group by round(count,-4) order by count(点击序号) desc select * from tempt
2、词库分析
1、数据准备 CREATE TABLE [lexiconTemp] ( [词语] [char] (50) COLLATE Chinese_PRC_CI_AS NULL , [词频] [int] NULL , [词性] [char] (20) COLLATE Chinese_PRC_CI_AS NULL ,
) GO
添加主键即可
2、分析 1)查看词语分析情况 select len(词语),count(*) from lexicon group by len(词语) order by count(*) desc select len(词频),count(*) from lexicon group by len(词频) order by count(*) desc select 词性,count(*) from lexicon group by 词性 order by count(*) desc
2)查看不同词语的词性分布情况 select 词性,count(*) from lexicon where len(词语)=2 group by 词性 order by count(*) desc select 词性,count(*) from lexicon where 词频<100000 group by 词性 order by count(*) desc
3、词语组合分析
1、数据准备 CREATE TABLE [WordPair] ( [词语组合] [char] (50) COLLATE Chinese_PRC_CI_AS NULL , [词频] [int] NULL ) ON [PRIMARY] GO
2、分析 1)出现最多的组合 select top 1000 * from wordpair order by 词频 desc
大家可以自行练习,并做出自己的实验分析结果
[此贴子已经被作者于2010-12-14 09:21:56编辑过]
|