1、用户查询日志
1、准备代码
CREATE DATABASE [UserQuery] ON (NAME = N'UserQuery_Data', FILENAME = N'C:\Program Files\Microsoft SQL Server\MSSQL\data\UserQuery_Data.MDF' , SIZE = 1, FILEGROWTH = 10%) LOG ON (NAME = N'UserQuery_Log', FILENAME = N'C:\Program Files\Microsoft SQL Server\MSSQL\data\UserQuery_Log.LDF' , SIZE = 1, FILEGROWTH = 10%)
COLLATE Chinese_PRC_CI_AS
GO
CREATE TABLE [LogFile] (
[用户] [varchar] (50) COLLATE Chinese_PRC_CI_AS NULL ,
[查询词] [varchar] (50) COLLATE Chinese_PRC_CI_AS NULL ,
[数值] [varchar] (50) COLLATE Chinese_PRC_CI_AS NULL ,
[点击URL] [varchar] (500) COLLATE Chinese_PRC_CI_AS NULL ,
)
GO
CREATE TABLE [LogInfo] (
[id] [int] IDENTITY (1, 1) NOT NULL ,
[用户] [varchar] (50) COLLATE Chinese_PRC_CI_AS NULL ,
[查询词] [varchar] (50) COLLATE Chinese_PRC_CI_AS NULL ,
[URL结果排名] [int] NULL ,
[点击序号] [int] NULL ,
[点击URL] [varchar] (500) COLLATE Chinese_PRC_CI_AS NULL ,
CONSTRAINT [PK_TABLE1] PRIMARY KEY CLUSTERED
(
[id]
) ON [PRIMARY]
) ON [PRIMARY]
GO
[LogFile] 生成 [LogInfo]
insert into loginfo select [用户],substring([查询词],2,len([查询词])-2),CONVERT(int, substring([数值],1,charindex(' ',[数值])-1)),CONVERT(int, substring([数值],charindex(' ',[数值])+1,10)),[点击URL] from logfile
可以加上各个字段的索引
2、分析
1)查看用户查询情况
select top 100 * from loginfo order by [用户],[点击序号]
2)查看用户会话情况
select top 100 用户,count(*) from loginfo group by 用户 order by count(*) desc
select count(distinct 用户) from loginfo
select * from loginfo where 用户='13505142608748166'
select * from loginfo where 用户='13505142608748166' order by 查询词,点击序号
select 用户 from loginfo group by 用户 having max(点击序号)=count(*) order by count(*)
select 用户 from loginfo group by 用户 having max(点击序号)=count(*) and count(*)>100 order by count(*)
3)查询词语情况
select top 100 查询词,count(*) from loginfo group by 查询词 order by count(*) desc
select count(distinct 查询词) from loginfo
select * from loginfo where 查询词='张玉凤' order by 用户
select avg(len(查询词)) from loginfo
select len(查询词),count(*) from loginfo group by len(查询词) order by count(*) desc
select top 100 查询词,count(distinct 用户) from loginfo group by 查询词 order by count(distinct 用户) desc
4)查询浏览结果的情况
select URL结果排名,count(*) from loginfo group by URL结果排名 order by count(*) desc
select 点击序号,count(*) from loginfo group by 点击序号 order by count(*) desc
select 点击序号,count(*) as 'count' into tempt from loginfo group by 点击序号 order by count(*) desc
select round(count,-4),count(点击序号) from tempt group by round(count,-4) order by count(点击序号) desc
select * from tempt
2、词库分析
1、数据准备
CREATE TABLE [lexiconTemp] (
[词语] [char] (50) COLLATE Chinese_PRC_CI_AS NULL ,
[词频] [int] NULL ,
[词性] [char] (20) COLLATE Chinese_PRC_CI_AS NULL ,
)
GO
添加主键即可
2、分析
1)查看词语分析情况
select len(词语),count(*) from lexicon group by len(词语) order by count(*) desc
select len(词频),count(*) from lexicon group by len(词频) order by count(*) desc
select 词性,count(*) from lexicon group by 词性 order by count(*) desc
2)查看不同词语的词性分布情况
select 词性,count(*) from lexicon where len(词语)=2 group by 词性 order by count(*) desc
select 词性,count(*) from lexicon where 词频<100000 group by 词性 order by count(*) desc
3、词语组合分析
1、数据准备
CREATE TABLE [WordPair] (
[词语组合] [char] (50) COLLATE Chinese_PRC_CI_AS NULL ,
[词频] [int] NULL
) ON [PRIMARY]
GO
2、分析
1)出现最多的组合
select top 1000 * from wordpair order by 词频 desc
大家可以自行练习,并做出自己的实验分析结果
[此贴子已经被作者于2010-12-14 09:21:56编辑过]