-- 作者:admin
-- 发布时间:2008/5/6 5:37:38
-- 可以计算PageRank等操作的Java代码和相应存储过程
一)数据库准备
1、将WebPageInfo的URL字段设置为小于900,并且对其建立索引: create index urlIndex on WebPageInfo(URL)
2、将AnchorUrl的URL字段和ANCHORURL字段设置为小于900,并建立索引: CREATE INDEX [UrlAndAnchor] ON [dbo].[AnchorUrl]([URL], [ANCHORURL]) ON [PRIMARY] GO
3、建立其他表资源 CREATE TABLE [IndexInfo] ( [WORDID] [int] IDENTITY (1, 1) NOT NULL , [KEYWORDS] [char] (1000) COLLATE Chinese_PRC_CI_AS NULL , [DOCID] [int] NULL , [POSITION] [int] NULL , [PR] [float] NULL , CONSTRAINT [PK_IndexInfo] PRIMARY KEY CLUSTERED ( [WORDID] ) ON [PRIMARY] ) ON [PRIMARY] GO
CREATE TABLE [CharInfo] ( [CHARID] [int] IDENTITY (1, 1) NOT NULL , [CHAR] [char] (2) COLLATE Chinese_PRC_CI_AS NULL , [DOCID] [int] NULL , [POSITION] [int] NULL , [PR] [float] NULL , CONSTRAINT [PK_CharInfo] PRIMARY KEY CLUSTERED ( [CHARID] ) ON [PRIMARY] ) ON [PRIMARY] GO
CREATE INDEX [IX_Char] ON [dbo].[CharInfo]([CHAR]) ON [PRIMARY] GO
4、建立填充title的存储过程 create proc getTitles as begin declare webpagecursor cursor for SELECT substring(pagetext,1,4000) FROM WebPageInfo
declare @ptext nchar(4000) open webpagecursor fetch next from webpagecursor into @ptext while (@@fetch_status=0) begin declare @start int declare @end int set @ptext=lower(@ptext) set @start=CHARINDEX(\'<title>\',@ptext,1) set @end=CHARINDEX(\'</title>\',@ptext,1) print @start print @end UPDATE WebPageInfo SET title =substring(@ptext,@start+7,@end-@start-7) WHERE CURRENT OF webpagecursor fetch next from webpagecursor into @ptext end close webpagecursor deallocate webpagecursor end
5、建立初始化PageRank值的存储过程 create proc initializePr as begin declare @value float select @value=count(*) from WebPageInfo update WebPageInfo set pr=1/@value end
6、建立准备计算PageRank的存储过程 CREATE proc prepareTempPr as begin if exists (select * from sysobjects where id = object_id(\'dbo.temppr\') and sysstat & 0xf = 3) begin drop table "dbo"."temppr" end if exists (select * from sysobjects where id = object_id(\'dbo.prdata\') and sysstat & 0xf = 3) begin drop table "dbo"."prdata" end
select AnchorUrl.docid,count(*) as \'count\',min(pr) as \'pr\' into temppr from AnchorUrl inner join WebPageInfo on WebPageInfo.docid= AnchorUrl.docid group by AnchorUrl.docid select anchorurl.docid,anchorurl.url,anchorurl.anchordocid, anchorurl.anchorurl,count ,pr into prdata from anchorurl inner join temppr on anchorurl.docid=temppr.docid end go
7、建立计算PageRank的存储过程 create proc cumputePR as begin set nocount on DECLARE @totalCount int set @totalCount=0 WHILE @totalCount<7 BEGIN declare prcursor cursor for select docid from temppr declare @num int open prcursor fetch next from prcursor into @num while (@@fetch_status=0) begin declare prcursor2 cursor for select rtrim(url),pr,[count] from prdata where rtrim(anchordocid)=@num declare @url char(200) declare @prvalue float declare @count int open prcursor2 declare @pagerank float set @pagerank=0.0 fetch next from prcursor2 into @url,@prvalue,@count while (@@fetch_status=0) begin set @pagerank=@pagerank+@prvalue/@count fetch next from prcursor2 into @url,@prvalue,@count end print @pagerank update temppr set pr=pr+@pagerank where docid=@num close prcursor2 deallocate prcursor2 print @num print \'+++++++++++++\' fetch next from prcursor into @num end declare @newprvalue float select @newprvalue=sum(pr) from temppr update temppr set pr=pr/@newprvalue
update prdata set pr=(select pr from temppr where temppr.docid=prdata.docid) close prcursor deallocate prcursor set @totalCount=@totalCount+1 END end
update WebPageInfo set pr=(select pr from temppr where temppr.docid=WebPageInfo.docid)
GO
8、建立提取关键词的存储过程 create proc getKeywords as begin declare kwcursor cursor for SELECT docid,substring(pagetext,1,4000) FROM WebPageInfo
declare @did int declare @ptext nchar(4000) open kwcursor fetch next from kwcursor into @did,@ptext while (@@fetch_status=0) begin declare @start int declare @end int set @start=0 set @end=0 while(1=1) begin set @start=CHARINDEX(\'>\',@ptext,@end) set @end=CHARINDEX(\'<\',@ptext,@start) if @start=0 or @end=0 break declare @str char(1000) set @str=substring(@ptext,@start+1,@end-@start-1) if ascii(ltrim(rtrim(@str)))!=13 and ascii(ltrim(rtrim(@str)))!=10 and ltrim(rtrim(@str))!=\'\' insert into IndexInfo(KEYWORDS,DOCID,[POSITION]) values(@str,@did,@start) end fetch next from kwcursor into @did,@ptext end close kwcursor deallocate kwcursor
update IndexInfo set pr=(select pr from WebPageInfo where WebPageInfo.docid=IndexInfo.docid) end
go
9、建立全文索引的存储过程 CREATE proc getChar as begin set nocount on declare indexcursor cursor for select wordid,keywords,docid,position,pr from IndexInfo declare @wid int declare @word char(1000) declare @did int declare @pos int declare @pr float
open indexcursor fetch next from indexcursor into @wid,@word,@did,@pos,@pr while (@@fetch_status=0) begin declare @count int set @count=1 while(@count<=len(@word)) begin declare @chinword char(2) set @chinword=substring(@word,@count,1) if(@chinword!=\' \') insert into CharInfo([char],docid,position,pr) values(@chinword,@did,@pos+@count,@pr) set @count=@count+1 end fetch next from indexcursor into @wid,@word,@did,@pos,@pr end close indexcursor deallocate indexcursor
drop index CharInfo.IX_CharInfo CREATE UNIQUE INDEX [IX_CharInfo] ON [dbo].[CharInfo]([CHARID]) ON [PRIMARY] end GO
10、执行process程序
二)Process程序
import java.sql.*; import java.sql.*;
//运行此程序,无需任何额外准备操作,但是以前的IndexInfo和CharInfo表记录没有自动删除 public class Process { public static Connection con; //填充标题字段 public static void fillTitles() { CallableStatement cstm; try { Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver"); con = DriverManager.getConnection("jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=SE","sa",""); cstm=con.prepareCall("{call getTitles}"); cstm.execute(); cstm.close(); con.close(); } catch(Exception e) { System.out.println(e.getMessage()); } } //填充ANCHORDOCID public static void fillAnchorDocID() { Statement stm; try { Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver"); con = DriverManager.getConnection("jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=SE","sa",""); stm=con.createStatement(); stm.executeUpdate("update anchorurl set anchordocid=(select docid from WebPageInfo where url=anchorurl.anchorurl)"); con.close(); } catch(Exception e) { System.out.println(e.getMessage()); } } public static void computePageRankOfAllPages() { //初始化pagerank值 CallableStatement cstm; try { Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver"); con = DriverManager.getConnection("jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=SE","sa",""); cstm=con.prepareCall("{call initializePr}"); cstm.execute(); cstm.close(); con.close(); } catch(Exception e) { System.out.println(e.getMessage()); } //构造临时表tempPr try { Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver"); con = DriverManager.getConnection("jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=SE","sa",""); cstm=con.prepareCall("{call prepareTempPr}"); cstm.execute(); cstm.close(); cstm=con.prepareCall("{call cumputePR}"); cstm.execute(); cstm.close(); con.close(); } catch(Exception e) { System.out.println(e.getMessage()); } } public static void getCharInKeyword() { //生成关键词序列和词语序列 CallableStatement cstm; try { Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver"); con = DriverManager.getConnection("jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=SE","sa",""); cstm=con.prepareCall("{call getKeywords}"); cstm.execute(); cstm.close(); cstm=con.prepareCall("{call getChar}"); cstm.execute(); cstm.close(); con.close(); } catch(Exception e) { System.out.println(e.getMessage()); } } public static void main(String [] args) { Process.fillTitles(); Process.fillAnchorDocID(); Process.computePageRankOfAllPages(); //Process.getCharInKeyword(); } }
[此贴子已经被作者于2010-12-14 09:26:07编辑过]
|