-- Link table: one row per hyperlink extracted from a crawled page.
-- The crawler in this file fills DOCID, URL and ANCHORURL only; ANCHORID is
-- generated by the IDENTITY property.
CREATE TABLE [AnchorUrl]
(
    [ANCHORID]    INT IDENTITY(1, 1) NOT NULL,  -- surrogate key, auto-incremented
    [DOCID]       INT                NULL,      -- id of the page the link was found on
    [URL]         VARCHAR(900)       NULL,      -- URL of the page the link was found on
    [ANCHORDOCID] INT                NULL,      -- presumably the id of the linked page; not written by the crawler shown here
    [ANCHORURL]   VARCHAR(900)       NULL,      -- resolved target URL of the link
    [ANCHORTEXT]  VARCHAR(200)       NULL,      -- presumably the link text; not written by the crawler shown here
    CONSTRAINT [PK_AnchorUrl] PRIMARY KEY CLUSTERED ([ANCHORID]) ON [PRIMARY]
) ON [PRIMARY]
GO
-- Table of URLs with two integer status columns.
-- NOTE(review): the name suggests a persistent queue of URLs waiting to be
-- crawled; the crawler code in this file does not read or write it, and the
-- meaning of [inflag] / [flag] is not visible here - confirm with the owner.
--
-- The table is created explicitly in the dbo schema so that it is the same
-- object the CREATE INDEX statement below refers to, regardless of the
-- default schema of the login that runs this script.
CREATE TABLE [dbo].[WaitingURL] (
    [uid]    [int] IDENTITY (1, 1) NOT NULL,                -- surrogate key, auto-incremented
    [url]    [varchar] (900) COLLATE Chinese_PRC_CI_AS NULL, -- indexed below by IX_WaitingURL
    [inflag] [int] NULL,
    [flag]   [int] NULL CONSTRAINT [DF__WaitingURL__flag__47DBAE45] DEFAULT ((-1)), -- defaults to -1
    CONSTRAINT [PK_WaitingURL] PRIMARY KEY CLUSTERED
    (
        [uid]
    ) ON [PRIMARY]
) ON [PRIMARY]
GO
-- Non-clustered index for lookups by URL.
CREATE INDEX [IX_WaitingURL] ON [dbo].[WaitingURL] ([url])
GO
2、将SQL Server的JDBC驱动(jar文件)加入classpath,或者直接放入<JDK5>\jre\lib\ext目录中(如:C:\Program Files\Java\jdk1.5.0_14\jre\lib\ext)
3、爬虫程序
import java.io.*;
import java.net.*;
import java.sql.*;
import java.util.*;
/**
 * A small multi-threaded web crawler.
 *
 * Starting from one URL, worker threads repeatedly take a URL from a shared
 * in-memory queue, download the page, store it in the WebPageInfo table, extract
 * the href="..." links, store them in the AnchorUrl table and queue every link
 * that has not been seen before. A worker exits after it has been idle (no URL
 * available) for IDLE_TIMEOUT_MS.
 */
public class Spider
{
    /** A worker thread stops after this many milliseconds without finding a URL to process. */
    private static final long IDLE_TIMEOUT_MS = 60000L;

    /** Pause between two polls of an empty queue, so idle workers do not spin on the CPU. */
    private static final long EMPTY_QUEUE_SLEEP_MS = 100L;

    /** Literal text that introduces a link target in the page source. */
    private static final String HREF_PREFIX = "href=\"";

    // MD5 digests of every URL that is waiting or has been processed. Only grows;
    // used to recognise URLs that were already queued.
    Hashtable<String, String> completePages = new Hashtable<String, String>();

    // URLs waiting to be processed. Grows and shrinks.
    Vector<String> waitingPages = new Vector<String>();

    // Database connection shared by all worker threads.
    Connection con;

    // Next document id to hand out. Guarded by docIdLock.
    Integer docIDCount = Integer.valueOf(1);

    // Number of worker threads to start.
    int threadCount = 0;

    // Guards the compound "check completePages, then add to both collections" operation.
    Object lock = new Object();

    // Dedicated monitor for docIDCount. The counter object itself must not be used
    // as the monitor because it is replaced on every increment.
    private final Object docIdLock = new Object();

    /**
     * Queues the start URL, opens the database connection and starts crawling.
     *
     * @param startURL    first URL to crawl
     * @param threadCount number of worker threads to start
     */
    public Spider(String startURL, int threadCount)
    {
        // Seed the in-memory queue.
        enqueueIfNew(startURL);
        try
        {
            Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
            // NOTE(review): connection string and credentials are hard-coded
            // (user "sa", empty password); consider moving them to configuration.
            con = DriverManager.getConnection("jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=SE","sa","");
        }
        catch(Exception ex)
        {
            System.out.println(ex.getMessage());
        }
        this.threadCount = threadCount;
        // Start the crawl.
        process();
    }

    /**
     * Adds a URL to the waiting queue unless its MD5 digest has been recorded before.
     *
     * @param url URL to queue
     * @return true if the URL was queued, false if it was already known or could not be hashed
     */
    private boolean enqueueIfNew(String url)
    {
        String digest = MD5(url);
        if (digest == null)
        {
            // MD5 returns null on failure; Hashtable rejects null keys.
            return false;
        }
        synchronized (lock)
        {
            if (completePages.containsKey(digest))
            {
                return false;
            }
            waitingPages.addElement(url);
            completePages.put(digest, digest);
            return true;
        }
    }

    /**
     * Resolves a link found on a page against the URL of that page.
     * Relative paths (including "." and "..") are resolved by java.net.URL.
     *
     * @param pageUrl URL of the page that contains the link
     * @param href    raw content of the href attribute
     * @return the absolute http/https URL, or null if the link is empty, malformed
     *         or uses another scheme (mailto:, javascript:, ftp:, ...)
     */
    private static String resolveLink(String pageUrl, String href)
    {
        String link = href.trim();
        if (link.length() == 0)
        {
            return null;
        }
        try
        {
            URL resolved = new URL(new URL(pageUrl), link);
            String protocol = resolved.getProtocol();
            if (!"http".equalsIgnoreCase(protocol) && !"https".equalsIgnoreCase(protocol))
            {
                return null;
            }
            return resolved.toExternalForm();
        }
        catch (MalformedURLException ex)
        {
            // Unknown scheme or unparsable link: skip this link only.
            return null;
        }
    }

    /** Closes a JDBC statement, reporting (not propagating) any failure. */
    private static void closeQuietly(Statement statement)
    {
        if (statement == null)
        {
            return;
        }
        try
        {
            statement.close();
        }
        catch (SQLException ex)
        {
            System.out.println(ex.getMessage());
        }
    }

    /**
     * Core crawl routine: initialises the document id counter from the database
     * and starts threadCount worker threads. Does nothing if there is no database
     * connection.
     */
    public void process()
    {
        if (con == null)
        {
            System.out.println("No database connection; crawl not started.");
            return;
        }
        // The next document id is the current maximum docid + 1, or 1 for an empty table.
        Statement stm = null;
        ResultSet res = null;
        try
        {
            stm = con.createStatement();
            res = stm.executeQuery("select max(docid) from WebPageInfo");
            int maxDocId = 0;
            if (res.next())
            {
                // getInt returns 0 when max(docid) is SQL NULL (empty table).
                maxDocId = res.getInt(1);
            }
            synchronized (docIdLock)
            {
                docIDCount = Integer.valueOf(maxDocId + 1);
            }
        }
        catch(Exception ex)
        {
            System.out.println(ex.getMessage());
        }
        finally
        {
            if (res != null)
            {
                try
                {
                    res.close();
                }
                catch (SQLException ex)
                {
                    System.out.println(ex.getMessage());
                }
            }
            closeQuietly(stm);
        }
        // Start the worker threads.
        System.out.println("Begin...");
        for (int i = 0; i < threadCount; i++)
        {
            // Worker number, used in log output.
            final int num = i;
            Runnable spiderthread = new Runnable()
            {
                // Inserts one row per downloaded page.
                PreparedStatement pstmWebPageInfo;
                // Inserts one row per extracted link.
                PreparedStatement pstmAnchorUrl;
                // Reusable buffer holding the bytes of the page being downloaded.
                ByteArrayOutputStream baos = new ByteArrayOutputStream();

                /** Worker loop: take a URL, process it, repeat until idle for too long. */
                public void run()
                {
                    System.out.println("Thread "+num+" begin...");
                    try
                    {
                        pstmWebPageInfo = con.prepareStatement("insert into WebPageInfo (docID,URL,PAGETEXT) values(?,?,?)");
                        pstmAnchorUrl = con.prepareStatement("insert into AnchorUrl(DOCID,URL,ANCHORURL) values(?,?,?)");
                    }
                    catch(Exception ex)
                    {
                        System.out.println(ex.getMessage());
                    }
                    long lastWorkTime = System.currentTimeMillis();
                    try
                    {
                        while (true)
                        {
                            // Exit once no URL has been available for IDLE_TIMEOUT_MS.
                            if (System.currentTimeMillis() - lastWorkTime > IDLE_TIMEOUT_MS)
                            {
                                System.out.println(num+"*****End*******");
                                break;
                            }
                            String url = null;
                            synchronized (lock)
                            {
                                if (!waitingPages.isEmpty())
                                {
                                    url = waitingPages.remove(0);
                                }
                            }
                            if (url == null)
                            {
                                // Nothing to do right now: wait a little instead of spinning.
                                try
                                {
                                    Thread.sleep(EMPTY_QUEUE_SLEEP_MS);
                                }
                                catch (InterruptedException ie)
                                {
                                    Thread.currentThread().interrupt();
                                    break;
                                }
                                continue;
                            }
                            lastWorkTime = System.currentTimeMillis();
                            System.out.println("Thread "+num+" Working at "+url);
                            processPage(url);
                        }
                    }
                    finally
                    {
                        closeQuietly(pstmWebPageInfo);
                        closeQuietly(pstmAnchorUrl);
                    }
                }

                /**
                 * Downloads one page and, if it looks like markup (first non-blank
                 * byte is '<'), stores it and its outgoing links.
                 */
                private void processPage(String url)
                {
                    InputStream is = null;
                    try
                    {
                        baos.reset();
                        URLConnection urlConnection = new URL(url).openConnection();
                        is = new BufferedInputStream(urlConnection.getInputStream());
                        int oneByte = is.read();
                        // Skip leading blanks (space, tab, CR, LF) before the first tag.
                        while (oneByte == ' ' || oneByte == '\t' || oneByte == '\r' || oneByte == '\n')
                        {
                            oneByte = is.read();
                        }
                        if (oneByte != '<')
                        {
                            // Empty or non-markup content: ignore this URL.
                            return;
                        }
                        // Copy the body; stops at end of stream (-1) and, like the
                        // original loop, at a NUL byte.
                        while (oneByte > 0)
                        {
                            baos.write(oneByte);
                            oneByte = is.read();
                        }
                        String webPageContent = baos.toString();
                        // Store the page.
                        int docID = writeToDatabase(url, webPageContent);
                        System.out.println("done!docid="+docID+" url="+url);
                        // Extract and store the outgoing links.
                        analyzeAnchor(docID, url, webPageContent);
                    }
                    catch(Exception ex)
                    {
                        System.out.println(ex.getMessage());
                    }
                    finally
                    {
                        if (is != null)
                        {
                            try
                            {
                                is.close();
                            }
                            catch (IOException ex)
                            {
                                System.out.println(ex.getMessage());
                            }
                        }
                    }
                }

                /**
                 * Stores a page in WebPageInfo.
                 *
                 * @return the document id assigned to the page (assigned even if the insert fails)
                 */
                public int writeToDatabase(String url, String content)
                {
                    // Allocate a new document id.
                    int newDocID;
                    synchronized (docIdLock)
                    {
                        newDocID = docIDCount.intValue();
                        docIDCount = Integer.valueOf(newDocID + 1);
                    }
                    try
                    {
                        pstmWebPageInfo.setInt(1, newDocID);
                        pstmWebPageInfo.setString(2, url);
                        pstmWebPageInfo.setString(3, content);
                        pstmWebPageInfo.executeUpdate();
                    }
                    catch (Exception ex)
                    {
                        System.out.println(ex.getMessage());
                    }
                    return newDocID;
                }

                /**
                 * Extracts every href="..." target from the page text, resolves it
                 * against the page URL and passes the resulting list to storeAnchor.
                 */
                public void analyzeAnchor(int docID, String url, String content)
                {
                    ArrayList<String> otherAnchor = new ArrayList<String>();
                    int searchFrom = 0;
                    while (true)
                    {
                        int beginPosition = content.indexOf(HREF_PREFIX, searchFrom);
                        if (beginPosition == -1)
                        {
                            break;
                        }
                        int valueStart = beginPosition + HREF_PREFIX.length();
                        int endPosition = content.indexOf("\"", valueStart);
                        if (endPosition == -1)
                        {
                            break;
                        }
                        // A link that cannot be resolved is skipped; it no longer
                        // aborts the scan of the remaining links.
                        String finalStr = resolveLink(url, content.substring(valueStart, endPosition));
                        if (finalStr != null)
                        {
                            otherAnchor.add(finalStr);
                        }
                        searchFrom = endPosition + 1;
                    }
                    storeAnchor(docID, url, otherAnchor);
                }

                /**
                 * Writes each link to AnchorUrl and queues the links that have not
                 * been seen before. No domain restriction is applied.
                 */
                public void storeAnchor(int docID, String url, ArrayList anchors)
                {
                    for (int i = 0; i < anchors.size(); i++)
                    {
                        String anchorURL = (String) anchors.get(i);
                        try
                        {
                            pstmAnchorUrl.setInt(1, docID);
                            pstmAnchorUrl.setString(2, url);
                            pstmAnchorUrl.setString(3, anchorURL);
                            pstmAnchorUrl.executeUpdate();
                        }
                        catch (Exception ex)
                        {
                            System.out.println(ex.getMessage());
                        }
                        enqueueIfNew(anchorURL);
                    }
                }
            };
            new Thread(spiderthread).start();
        }
    }

    /**
     * Computes the MD5 digest of a string (encoded as UTF-8).
     *
     * @param s text to hash
     * @return the digest as 32 lower-case hex characters, or null if hashing fails
     *         (including when s is null)
     */
    public String MD5(String s)
    {
        char hexDigits[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
        try
        {
            // Fixed charset so the digest does not depend on the platform default.
            byte[] strTemp = s.getBytes("UTF-8");
            java.security.MessageDigest mdTemp = java.security.MessageDigest.getInstance("MD5");
            mdTemp.update(strTemp);
            byte[] md = mdTemp.digest();
            char str[] = new char[md.length * 2];
            int k = 0;
            for (int i = 0; i < md.length; i++)
            {
                byte byte0 = md[i];
                str[k++] = hexDigits[byte0 >>> 4 & 0xf];
                str[k++] = hexDigits[byte0 & 0xf];
            }
            return new String(str);
        }
        catch (Exception e)
        {
            return null;
        }
    }

    /** Starts a crawl of http://localhost/temp/a.html with 10 worker threads. */
    public static void main(String args[])
    {
        new Spider("http://localhost/temp/a.html",10);
    }
}