一)数据库准备
1、将WebPageInfo的URL字段设置为小于900,并且对其建立索引:
create index urlIndex on WebPageInfo(URL)
2、将AnchorUrl的URL字段和ANCHORURL字段设置为小于900,并建立索引:
CREATE INDEX [UrlAndAnchor] ON [dbo].[AnchorUrl]([URL], [ANCHORURL]) ON [PRIMARY]
GO
3、建立其他表资源
CREATE TABLE [IndexInfo] (
[WORDID] [int] IDENTITY (1, 1) NOT NULL ,
[KEYWORDS] [char] (1000) COLLATE Chinese_PRC_CI_AS NULL ,
[DOCID] [int] NULL ,
[POSITION] [int] NULL ,
[PR] [float] NULL ,
CONSTRAINT [PK_IndexInfo] PRIMARY KEY CLUSTERED
(
[WORDID]
) ON [PRIMARY]
) ON [PRIMARY]
GO
CREATE TABLE [CharInfo] (
[CHARID] [int] IDENTITY (1, 1) NOT NULL ,
[CHAR] [char] (2) COLLATE Chinese_PRC_CI_AS NULL ,
[DOCID] [int] NULL ,
[POSITION] [int] NULL ,
[PR] [float] NULL ,
CONSTRAINT [PK_CharInfo] PRIMARY KEY CLUSTERED
(
[CHARID]
) ON [PRIMARY]
) ON [PRIMARY]
GO
CREATE INDEX [IX_Char] ON [dbo].[CharInfo]([CHAR]) ON [PRIMARY]
GO
4、建立填充title的存储过程
create proc getTitles
as
begin
declare webpagecursor cursor
for SELECT substring(pagetext,1,4000) FROM WebPageInfo
declare @ptext nchar(4000)
open webpagecursor
fetch next from webpagecursor into @ptext
while (@@fetch_status=0)
begin
declare @start int
declare @end int
set @ptext=lower(@ptext)
set @start=CHARINDEX('<title>',@ptext,1)
set @end=CHARINDEX('</title>',@ptext,1)
print @start
print @end
UPDATE WebPageInfo SET title =substring(@ptext,@start+7,@end-@start-7) WHERE CURRENT OF webpagecursor
fetch next from webpagecursor into @ptext
end
close webpagecursor
deallocate webpagecursor
end
5、建立初始化PageRank值的存储过程
create proc initializePr
as
begin
declare @value float
select @value=count(*) from WebPageInfo
update WebPageInfo set pr=1/@value
end
6、建立准备计算PageRank的存储过程
CREATE proc prepareTempPr
as
begin
if exists (select * from sysobjects where id = object_id('dbo.temppr') and sysstat & 0xf = 3)
begin
drop table "dbo"."temppr"
end
if exists (select * from sysobjects where id = object_id('dbo.prdata') and sysstat & 0xf = 3)
begin
drop table "dbo"."prdata"
end
select AnchorUrl.docid,count(*) as 'count',min(pr) as 'pr' into temppr from AnchorUrl inner join WebPageInfo on WebPageInfo.docid= AnchorUrl.docid group by AnchorUrl.docid
select anchorurl.docid,anchorurl.url,anchorurl.anchordocid, anchorurl.anchorurl,count ,pr into prdata from anchorurl inner join temppr on anchorurl.docid=temppr.docid
end
go
7、建立计算PageRank的存储过程
create proc cumputePR
as
begin
set nocount on
DECLARE @totalCount int
set @totalCount=0
WHILE @totalCount<7
BEGIN
declare prcursor cursor
for select docid from temppr
declare @num int
open prcursor
fetch next from prcursor into @num
while (@@fetch_status=0)
begin
declare prcursor2 cursor
for select rtrim(url),pr,[count] from prdata where rtrim(anchordocid)=@num
declare @url char(200)
declare @prvalue float
declare @count int
open prcursor2
declare @pagerank float
set @pagerank=0.0
fetch next from prcursor2 into @url,@prvalue,@count
while (@@fetch_status=0)
begin
set @pagerank=@pagerank+@prvalue/@count
fetch next from prcursor2 into @url,@prvalue,@count
end
print @pagerank
update temppr set pr=pr+@pagerank where docid=@num
close prcursor2
deallocate prcursor2
print @num
print '+++++++++++++'
fetch next from prcursor into @num
end
declare @newprvalue float
select @newprvalue=sum(pr) from temppr
update temppr set pr=pr/@newprvalue
update prdata set pr=(select pr from temppr where temppr.docid=prdata.docid)
close prcursor
deallocate prcursor
set @totalCount=@totalCount+1
END
end
update WebPageInfo set pr=(select pr from temppr where temppr.docid=WebPageInfo.docid)
GO
8、建立提取关键词的存储过程
create proc getKeywords
as begin
declare kwcursor cursor
for SELECT docid,substring(pagetext,1,4000) FROM WebPageInfo
declare @did int
declare @ptext nchar(4000)
open kwcursor
fetch next from kwcursor into @did,@ptext
while (@@fetch_status=0)
begin
declare @start int
declare @end int
set @start=0
set @end=0
while(1=1)
begin
set @start=CHARINDEX('>',@ptext,@end)
set @end=CHARINDEX('<',@ptext,@start)
if @start=0 or @end=0
break
declare @str char(1000)
set @str=substring(@ptext,@start+1,@end-@start-1)
if ascii(ltrim(rtrim(@str)))!=13 and ascii(ltrim(rtrim(@str)))!=10 and ltrim(rtrim(@str))!=''
insert into IndexInfo(KEYWORDS,DOCID,[POSITION]) values(@str,@did,@start)
end
fetch next from kwcursor into @did,@ptext
end
close kwcursor
deallocate kwcursor
update IndexInfo set pr=(select pr from WebPageInfo where WebPageInfo.docid=IndexInfo.docid)
end
go
9、建立全文索引的存储过程
CREATE proc getChar
as
begin
set nocount on
declare indexcursor cursor
for select wordid,keywords,docid,position,pr from IndexInfo
declare @wid int
declare @word char(1000)
declare @did int
declare @pos int
declare @pr float
open indexcursor
fetch next from indexcursor into @wid,@word,@did,@pos,@pr
while (@@fetch_status=0)
begin
declare @count int
set @count=1
while(@count<=len(@word))
begin
declare @chinword char(2)
set @chinword=substring(@word,@count,1)
if(@chinword!=' ')
insert into CharInfo([char],docid,position,pr) values(@chinword,@did,@pos+@count,@pr)
set @count=@count+1
end
fetch next from indexcursor into @wid,@word,@did,@pos,@pr
end
close indexcursor
deallocate indexcursor
drop index CharInfo.IX_CharInfo
CREATE UNIQUE INDEX [IX_CharInfo] ON [dbo].[CharInfo]([CHARID]) ON [PRIMARY]
end
GO
10、执行process程序
二)Process程序
import java.sql.*;
import java.sql.*;
//运行此程序,无需任何额外准备操作,但是以前的IndexInfo和CharInfo表记录没有自动删除
public class Process
{
public static Connection con;
//填充标题字段
public static void fillTitles()
{
CallableStatement cstm;
try
{
Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
con = DriverManager.getConnection("jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=SE","sa","");
cstm=con.prepareCall("{call getTitles}");
cstm.execute();
cstm.close();
con.close();
}
catch(Exception e)
{
System.out.println(e.getMessage());
}
}
//填充ANCHORDOCID
public static void fillAnchorDocID()
{
Statement stm;
try
{
Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
con = DriverManager.getConnection("jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=SE","sa","");
stm=con.createStatement();
stm.executeUpdate("update anchorurl set anchordocid=(select docid from WebPageInfo where url=anchorurl.anchorurl)");
con.close();
}
catch(Exception e)
{
System.out.println(e.getMessage());
}
}
public static void computePageRankOfAllPages()
{
//初始化pagerank值
CallableStatement cstm;
try
{
Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
con = DriverManager.getConnection("jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=SE","sa","");
cstm=con.prepareCall("{call initializePr}");
cstm.execute();
cstm.close();
con.close();
}
catch(Exception e)
{
System.out.println(e.getMessage());
}
//构造临时表tempPr
try
{
Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
con = DriverManager.getConnection("jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=SE","sa","");
cstm=con.prepareCall("{call prepareTempPr}");
cstm.execute();
cstm.close();
cstm=con.prepareCall("{call cumputePR}");
cstm.execute();
cstm.close();
con.close();
}
catch(Exception e)
{
System.out.println(e.getMessage());
}
}
public static void getCharInKeyword()
{
//生成关键词序列和词语序列
CallableStatement cstm;
try
{
Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
con = DriverManager.getConnection("jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=SE","sa","");
cstm=con.prepareCall("{call getKeywords}");
cstm.execute();
cstm.close();
cstm=con.prepareCall("{call getChar}");
cstm.execute();
cstm.close();
con.close();
}
catch(Exception e)
{
System.out.println(e.getMessage());
}
}
public static void main(String [] args)
{
Process.fillTitles();
Process.fillAnchorDocID();
Process.computePageRankOfAllPages();
//Process.getCharInKeyword();
}
}
[此贴子已经被作者于2010-12-14 09:26:07编辑过]