转(柳永法)
ASP/Visual Basic代码
- '/*=========================================================================
- ' * Intro 研究网页编码很长时间了,因为最近要设计一个友情链接检测的VBS脚本,而与你链接的人的页面很可能是各种编码,以前采取的方法是:如果用GB2312查不到再用UTF-8查,再找不到证明对方没有给你做链接虽然不是100%正确,但也差不多了,这两种编码用的人比较多,偶然间在收藏夹里的一个地址看到的一个思路,终于可以在采集文章时自动判断网页的编码了。因为研究过程中这个问题困扰很久,虽然现在觉得简单了,想必很多人还在找,所以把这三个函数贴出来。
- ' * FileName GetWebCodePage.vbs
- ' * Author yongfa365
- ' * Version v2.0
- ' * WEB http://www.yongfa365.com
- ' * Email yongfa365[at]qq.com
- ' * FirstWrite http://www.yongfa365.com/Item/GetWebCodePage.vbs.html
- ' * MadeTime 2008-01-29 20:55:46
- ' * LastModify 2008-01-30 20:55:46
- ' *==========================================================================*/
- Call getHTTPPage("http://www.baidu.com/")
- Call getHTTPPage("http://www.google.com/")
- Call getHTTPPage("http://www.yongfa365.com/")
- Call getHTTPPage("http://www.cbdcn.com/")
- Call getHTTPPage("http://www.csdn.net/")
- '得到匹配的内容,返回数组
- 'getContents(表达式,字符串,是否返回引用值)
- 'msgbox getContents("a(.+?)b", "a23234b ab a67896896b sadfasdfb" ,True)(0)
- Function getContents(patrn, strng , yinyong)
- 'by www.yongfa365.com 转载请保留链接,以便最终用户及时得到最新更新信息
- On Error Resume Next
- Set re = New RegExp
- re.Pattern = patrn
- re.IgnoreCase = True
- re.Global = True
- Set Matches = re.Execute(strng)
- If yinyong Then
- For i = 0 To Matches.Count -1
- If Matches(i).Value<>"" Then RetStr = RetStr & Matches(i).SubMatches(0) & "柳永法"
- Next
- Else
- For Each oMatch in Matches
- If oMatch.Value<>"" Then RetStr = RetStr & oMatch.Value & "柳永法"
- Next
- End If
- getContents = Split(RetStr, "柳永法")
- End Function
- Function getHTTPPage(url)
- On Error Resume Next
- Set xmlhttp = CreateObject("MSXML2.XMLHTTP")
- xmlhttp.Open "Get", url, False
- xmlhttp.Send
- If xmlhttp.Status<>200 Then Exit Function
- GetBody = xmlhttp.ResponseBody
- '柳永法(www.yongfa365.com)在此的思路是,先根据返回的字符串找,找文件头,如果还没有的话就用GB2312,一般都能直接匹配出编码。
- '在返回的字符串里看,虽然中文是乱码,但不影响我们取其编码,
- GetCodePage = getContents("charset=[""']*([^"",']+)", xmlhttp.ResponseText , True)(0)
- '在头文件里看编码
- If Len(GetCodePage)<3 Then GetCodePage = getContents("charset=[""']*([^"",']+)", xmlhttp.getResponseHeader("Content-Type") , True)(0)
- If Len(GetCodePage)<3 Then GetCodePage = "gb2312"
- Set xmlhttp = Nothing
- '下边这句在正式使用时要屏蔽掉
- WScript.Echo url & "-->" & GetCodePage
- getHTTPPage = BytesToBstr(GetBody, GetCodePage)
- End Function
- Function BytesToBstr(Body, Cset)
- On Error Resume Next
- Dim objstream
- Set objstream = CreateObject("adodb.stream")
- objstream.Type = 1
- objstream.Mode = 3
- objstream.Open
- objstream.Write Body
- objstream.Position = 0
- objstream.Type = 2
- objstream.Charset = Cset
- BytesToBstr = objstream.ReadText
- objstream.Close
- Set objstream = Nothing
- End Function
- '/*=========================================================================
- ' * Intro 研究网页编码很长时间了,因为最近要设计一个友情链接检测的VBS脚本,而与你链接的人的页面很可能是各种编码,以前采取的方法是:如果用GB2312查不到再用UTF-8查,再找不到证明对方没有给你做链接虽然不是100%正确,但也差不多了,这两种编码用的人比较多,偶然间在收藏夹里的一个地址看到的一个思路,终于可以在采集文章时自动判断网页的编码了。因为研究过程中这个问题困扰很久,虽然现在觉得简单了,想必很多人还在找,所以把这三个函数贴出来。
- ' * FileName GetWebCodePage.vbs
- ' * Author yongfa365
- ' * Version v2.0
- ' * WEB http://www.yongfa365.com
- ' * Email yongfa365[at]qq.com
- ' * FirstWrite http://www.yongfa365.com/Item/GetWebCodePage.vbs.html
- ' * MadeTime 2008-01-29 20:55:46
- ' * LastModify 2008-01-30 20:55:46
- ' *==========================================================================*/
- Call getHTTPPage("http://www.baidu.com/")
- Call getHTTPPage("http://www.google.com/")
- Call getHTTPPage("http://www.yongfa365.com/")
- Call getHTTPPage("http://www.cbdcn.com/")
- Call getHTTPPage("http://www.csdn.net/")
- '得到匹配的内容,返回数组
- 'getContents(表达式,字符串,是否返回引用值)
- 'msgbox getContents("a(.+?)b", "a23234b ab a67896896b sadfasdfb" ,True)(0)
- Function getContents(patrn, strng , yinyong)
- 'by www.yongfa365.com 转载请保留链接,以便最终用户及时得到最新更新信息
- On Error Resume Next
- Set re = New RegExp
- re.Pattern = patrn
- re.IgnoreCase = True
- re.Global = True
- Set Matches = re.Execute(strng)
- If yinyong Then
- For i = 0 To Matches.Count -1
- If Matches(i).Value<>"" Then RetStr = RetStr & Matches(i).SubMatches(0) & "柳永法"
- Next
- Else
- For Each oMatch in Matches
- If oMatch.Value<>"" Then RetStr = RetStr & oMatch.Value & "柳永法"
- Next
- End If
- getContents = Split(RetStr, "柳永法")
- End Function
- Function getHTTPPage(url)
- On Error Resume Next
- Set xmlhttp = CreateObject("MSXML2.XMLHTTP")
- xmlhttp.Open "Get", url, False
- xmlhttp.Send
- If xmlhttp.Status<>200 Then Exit Function
- GetBody = xmlhttp.ResponseBody
- '柳永法(www.yongfa365.com)在此的思路是,先根据返回的字符串找,找文件头,如果还没有的话就用GB2312,一般都能直接匹配出编码。
- '在返回的字符串里看,虽然中文是乱码,但不影响我们取其编码,
- GetCodePage = getContents("charset=[""']*([^"",']+)", xmlhttp.ResponseText , True)(0)
- '在头文件里看编码
- If Len(GetCodePage)<3 Then GetCodePage = getContents("charset=[""']*([^"",']+)", xmlhttp.getResponseHeader("Content-Type") , True)(0)
- If Len(GetCodePage)<3 Then GetCodePage = "gb2312"
- Set xmlhttp = Nothing
- '下边这句在正式使用时要屏蔽掉
- WScript.Echo url & "-->" & GetCodePage
- getHTTPPage = BytesToBstr(GetBody, GetCodePage)
- End Function
- Function BytesToBstr(Body, Cset)
- On Error Resume Next
- Dim objstream
- Set objstream = CreateObject("adodb.stream")
- objstream.Type = 1
- objstream.Mode = 3
- objstream.Open
- objstream.Write Body
- objstream.Position = 0
- objstream.Type = 2
- objstream.Charset = Cset
- BytesToBstr = objstream.ReadText
- objstream.Close
- Set objstream = Nothing
- End Function
