asp远程网页数据采集常用函数代码
时间:2008-10-31 12:44:00
下面列出了asp远程网页数据采集程序中经常用到的函数,很实用,特别是正则表达式过滤函数。包括了使用xmlhttp采集远程网页内容,使用adodb.stream转换编码,内容过滤匹配等等..
数据采集程序
On Error Resume Next
Server.Scripttimeout=300
'采集远程数据
Function getHTTPData(url)
dim http
set http=Server.createobject("Msxml2.XMLHTTP")
if instr(url,"http://")=0 then url="http://"&url
Http.open "GET",url,false
Http.send()
if Http.Status<>200 then exit function
getHTTPData=bytesToBSTR(Http.responseBody,"UTF-8")
set http=nothing
if err.number<>0 then err.Clear
sCharset=""
End function
'网页编码转换
Function BytesToBstr(body,Cset)
dim objstream
set objstream = Server.CreateObject("adodb.stream")
objstream.Type = 1
objstream.Mode =3
objstream.Open
objstream.Write body
objstream.Position = 0
objstream.Type = 2
objstream.Charset = Cset
BytesToBstr = objstream.ReadText
objstream.Close
set objstream = nothing
End Function
'---------------------------------------------------------------------
'服务器登录
Function login(url)
dim http
set http=Server.createobject("Msxml2.XMLHTTP")
if instr(url,"http://")=0 then url="http://"&url
Http.open "GET",url,false
Http.send()
if Http.Status<>200 then exit function
set http=nothing
if err.number<>0 then err.Clear
End function
'---------------------------------------------------------------------
'正则替换
Function ReplaceText(fString,patrn, replStr)
Set regEx = New RegExp
regEx.Pattern = patrn
regEx.IgnoreCase = True
regEx.Global = True
ReplaceText = regEx.Replace(fString, replStr)
End Function
'---------------------------------------------------------------------
'去标签 包括内容
Function ReplaceTag(str, tag)
Set regEx = New RegExp
regEx.Pattern = "<"&tag&"[^>]*?>.*?<\/"&tag&">"
regEx.IgnoreCase = True
regEx.Global = True
ReplaceTag=regEx.Replace(str, "")
End Function
'---------------------------------------------------------------------
'去标签 不包括内容
Function ReplaceTab(str, tag)
Set regEx = New RegExp
regEx.Pattern = "<\/?"&tag&"[^>]*>"
regEx.IgnoreCase = True
regEx.Global = True
ReplaceTab=regEx.Replace(str, "")
End Function
'---------------------------------------------------------------------
'去标签属性 保留标签
Function ReplaceinnerTag(str, tag)
Set regEx = New RegExp
regEx.Pattern = "(<\/?"&tag&")[^>]*>"
regEx.IgnoreCase = True
regEx.Global = True
ReplaceinnerTag=regEx.Replace(str, "$1>")
End Function
'---------------------------------------------------------------------
'按正则取数据
Function getText(fString, patrn,n)
dim Matches, tStr
tStr = fString
Set re = New Regexp
re.IgnoreCase = True
re.Global = True
re.Pattern = patrn
set Matches = re.Execute(tStr)
set re = nothing
rStr = ""
For Each Match in Matches
rStr = Match.SubMatches(n)
exit for
Next
getText = rStr
End Function
'---------------------------------------------------------------------
'数据过滤
Function Encode_text(str)
If Isnull(str) Then
Encode_text = ""
Exit Function
End If
str = ReplaceText(str, "<\/?br[^>]*>" , vbCrlf )
str = ReplaceText(str, "<\/?p[^>]*>" , vbCrlf )
str = ReplaceTab(str, "[a-zA-Z]")
str = ReplaceText(str, "\n\s*\r" ,Chr(10)&Chr(13))
str = Replace(str, "&" , "&" )
str = Replace(str, ";" , ";" )
str = Replace(str, "&" , "&" )
str = Replace(str,Chr(34), """ )
str = Replace(str, "'" , "'" )
str = Replace(str, "<" , "<" )
str = Replace(str, ">" , ">" )
str = Replace(str, "(" , "(" )
str = Replace(str, ")" , ")" )
str = Replace(str, "*" , "*" )
str = Replace(str, "%" , "%" )
str = Replace(str,vbCrlf, "<br/>" )
Encode_text = str
End Function
'---------------------------------------------------------------------
'通过Matches取数据
dim Matches
sub setMatches(str,sRe)
Set re = New Regexp
re.IgnoreCase = True
re.Global = True
re.Pattern = sRe
set Matches = re.Execute(str)
set re=nothing
end sub
'例子
call setMatches(textcontent, re)
For Each Match in Matches
response.write Match.value
Next