web-dev-qa-db-ja.com

VBAを使用してHTMLテーブルをExcelに変換する

HTMLテーブルをExcelに変換

次のコードは、 https://rasmusrhl.github.io/stuff のHTMLテーブルを取得し、Excel形式に変換します。

問題はそれです:

  • 括弧内の数値は負の数値に変換されます
  • 数値は四捨五入または切り捨てられます

ソリューション

多大なご協力ありがとうございました。さまざまな答えは、私の目的にとっては回避策が最善の解決策であるということを理解するのに役立ちました。HTMLテーブルを自分で生成するため、各セルのCSSを制御できます。セルの内容を解釈する方法をExcelに指示するCSSコードがあります: http://cosicimiento.blogspot.dk/2008/11/styling-Excel-cells-with-mso-number.html この質問: ExcelがテキストとしてフォーマットされるようにHTMLテーブルセルをフォーマットしますか?

私の場合、CSSはmso-number-format:\"\\@\"。以下のRコードに統合されています。

library(htmlTable)
library(nycflights13)
library(dplyr)

nycflights13::planes %>% 
    slice(1:10) %>% mutate( seats = seats*1.0001,
                            s1    = c("1-5", "5-10", "1/2", "1/10", "2-3", "1", "1.0", "01", "01.00", "asfdkjlæ" ),
                            s2    = c("(10)", "(12)", "(234)", "(00)", "(01)", "(098)", "(01)", "(01.)", "(001.0)", "()" )) -> df 


rle_man <- rle(df$manufacturer)

css_matrix <- matrix( data = "mso-number-format:\"\\@\"", nrow = nrow(df), ncol = ncol(df))
css_matrix[,1] <- "padding-left: 0.4cm;mso-number-format:\"\\@\""
css_matrix[,2:10] <- "padding-left: 1cm;mso-number-format:\"\\@\""
css_matrix[,5] <- "padding-left: 2cm;mso-number-format:\"\\@\""


htmlTable( x = df,  
           rgroup   = rle_man$values, n.rgroup = rle_man$lengths, 
           rnames   = FALSE, align = c("l", "r" ), 
           cgroup   =  rbind(  c("", "Some text goes here. It is long and does not break", "Other text goes here", NA),
                               c( "", "Machine type<br>(make)", "Specification of machine", "Other variables")),
           n.cgroup = rbind(   c(1,8,2, NA),
                               c(1, 3, 5, 2)), 
           css.cell = css_matrix )            -> html_out

temp_file <- tempfile( pattern = "table", fileext = ".html" )
readr::write_file( x = html_out, path = temp_file)
utils::browseURL( temp_file)

そのHTMLファイルをExcelにドラッグアンドドロップすると、すべてのセルがテキストとして解釈されます。 html-fileをExcelにドラッグアンドドロップするだけで機能し、ブラウザーでテーブルを開いてExcelにコピーアンドペーストすることはできません。

この方法で欠けているのは水平線だけですが、私はそれに耐えることができます。

以下は、ドラッグアンドドロップと同じ効果を持つVBAです。

Sub importhtml()
'
' importhtml Macro
'

'
With ActiveSheet.QueryTables.Add(Connection:= _
                                 "URL;file:///C:/Users/INSERTUSERNAME/Desktop/table18b85c0a20f3html.HTML", Destination:=Range("$a$1"))

.FieldNames = True
.RowNumbers = False
.FillAdjacentFormulas = False
.PreserveFormatting = False
.RefreshOnFileOpen = False
.BackgroundQuery = True
.RefreshStyle = xlInsertDeleteCells
.SavePassword = False
.SaveData = True
.AdjustColumnWidth = True
.RefreshPeriod = 0
.WebSelectionType = xlEntirePage
.WebFormatting = xlWebFormattingAll
.WebPreFormattedTextToColumns = True
.WebConsecutiveDelimitersAsOne = True
.WebSingleBlockTextImport = False
.WebDisableDateRecognition = True
.WebDisableRedirections = False
.Refresh BackgroundQuery:=False
End With

End Sub
29
Rasmus Larsen

クライアント側のソリューションの場合

したがって、コードの最初のブロックの後にこのコードを実行すると、最後の2列が書き換えられます。

Sub Test2()
    '* tools references ->
    '*   Microsoft HTML Object Library


    Dim oHtml4 As MSHTML.IHTMLDocument4
    Set oHtml4 = New MSHTML.HTMLDocument

    Dim oHtml As MSHTML.HTMLDocument
    Set oHtml = Nothing

    '* IHTMLDocument4.createDocumentFromUrl
    '* MSDN - IHTMLDocument4 createDocumentFromUrl method - https://msdn.Microsoft.com/en-us/library/aa752523(v=vs.85).aspx
    Set oHtml = oHtml4.createDocumentFromUrl("https://rasmusrhl.github.io/stuff/", "")
    While oHtml.readyState <> "complete"
        DoEvents  '* do not comment this out it is required to break into the code if in infinite loop
    Wend
    Debug.Assert oHtml.readyState = "complete"


    Dim oTRs As MSHTML.IHTMLDOMChildrenCollection
    Set oTRs = oHtml.querySelectorAll("TR")
    Debug.Assert oTRs.Length = 17

    Dim lRowNum As Long
    For lRowNum = 3 To oTRs.Length - 1

        Dim oTRLoop As MSHTML.HTMLTableRow
        Set oTRLoop = oTRs.Item(lRowNum)
        If oTRLoop.ChildNodes.Length > 1 Then

            Debug.Assert oTRLoop.ChildNodes.Length = 14

            Dim oSecondToLastColumn As MSHTML.HTMLTableCell
            Set oSecondToLastColumn = oTRLoop.ChildNodes.Item(12)

            ActiveSheet.Cells(lRowNum + 2, 13).Value2 = "'" & oSecondToLastColumn.innerText


            Dim oLastColumn As MSHTML.HTMLTableCell
            Set oLastColumn = oTRLoop.ChildNodes.Item(13)

            ActiveSheet.Cells(lRowNum + 2, 14).Value2 = "'" & oLastColumn.innerText

        End If
        'Stop

    Next lRowNum

    ActiveSheet.Columns("M:M").EntireColumn.AutoFit
    ActiveSheet.Columns("N:N").EntireColumn.AutoFit


End Sub

サーバー側ソリューションの場合

ソーススクリプトを制御し、それがRにあることがわかったので、mso-number-format: '\ @'で最終列をスタイルするようにRスクリプトを変更できます。これを実現するサンプルRスクリプトを次に示します。データと同じ次元のCSSマトリックスを作成し、CSSマトリックスをパラメーターとしてhtmlTableに渡します。代わりに、Rソースを改ざんしたことはありませんが、ここで解釈するための簡単な例を示します。

A=matrix(c("(2)","(4)","(3)","(1)","(5)","(7)"),nrow=2,ncol=3,byrow=TRUE)
css_matrix <- matrix(data="",nrow=2,ncol=3)
css_matrix[,3] <- "mso-number-format:\"\\@\""
htmlTable(x=A,css.cell=css_matrix)

Excelで開くと、これが表示されます enter image description here

ロビンマッケンジー 追加

サーバー側のソリューションで、OPがcss_matrix [、10:11] <-"mso-number-format:\"\@\""を既存のRコードに追加するだけでよいことに言及するかもしれません(最後のcss_matrixの後)。 。line)そして、特定の問題に対するソリューションを実装します

ありがとうロビン

8
S Meaden

そのページから表形式のデータを取得するには(形式をそのままに)、次のように試すことができます。

 Sub Fetch_Data()
    Dim http As New XMLHTTP60, html As New HTMLDocument
    Dim posts As Object, post As Object, elem As Object
    Dim row As Long, col As Long

    With http
        .Open "GET", "https://rasmusrhl.github.io/stuff/", False
        .send
        html.body.innerHTML = .responseText
    End With

    Set posts = html.getElementsByClassName("gmisc_table")(0)

    For Each post In posts.Rows
        For Each elem In post.Cells
            col = col + 1: Cells(row + 1, col).NumberFormat = "@": Cells(row + 1, col) = elem.innerText
        Next elem
        col = 0
        row = row + 1
    Next post
End Sub

ライブラリに追加する参照:

1. Microsoft HTML Object Library
2. Microsoft XML, v6.0  'or whatever version you have

これは、解析されたときにその部分がどのように見えるかです。 enter image description here

6
SIM

目的の出力が得られるかどうか試してみてください...

Sub GetWebData()
Dim IE As Object
Dim doc As Object
Dim TRs As Object
Dim TR As Object
Dim Cell As Object
Dim r As Long, c As Long

Application.ScreenUpdating = False

Set IE = CreateObject("InternetExplorer.Application")
IE.Visible = False
IE.navigate "https://rasmusrhl.github.io/stuff/"
Do While IE.Busy Or IE.readyState <> 4
    DoEvents
Loop
Set doc = IE.document

Set TRs = doc.getElementsByTagName("tr")
Cells.Clear

For Each TR In TRs
    r = r + 1
    For Each Cell In TR.Children
        c = c + 1
        Cells(r, c).NumberFormat = "@"
        Cells(r, c) = Cell.innerText
    Next Cell
    c = 0
Next TR
IE.Quit
Columns.AutoFit
Application.ScreenUpdating = True
End Sub

解決策2:

それを機能させるには、ツール(VBAエディター)->参照に移動して次の2つの参照を追加し、次に以下の2つの参照を見つけ、それらのチェックボックスをオンにして[OK]をクリックする必要があります。

1)Microsoft XML v6.0(利用可能な最大バージョンを見つける)

2)Microsoft HTML Object Library

Sub GetWebData2()
Dim XMLpage As New MSXML2.XMLHTTP60
Dim doc As New MSHTML.HTMLDocument
Dim TRs As IHTMLElementCollection
Dim TR As IHTMLElement
Dim Cell As IHTMLElement
Dim r As Long, c As Long

Application.ScreenUpdating = False

Set XMLpage = CreateObject("MSXML2.XMLHTTP")

XMLpage.Open "GET", "https://rasmusrhl.github.io/stuff/", False
XMLpage.send
doc.body.innerhtml = XMLpage.responsetext
Set TRs = doc.getElementsByTagName("tr")
Set TRs = doc.getElementsByTagName("tr")
Cells.Clear

For Each TR In TRs
    r = r + 1
    For Each Cell In TR.Children
        c = c + 1
        Cells(r, c).NumberFormat = "@"
        Cells(r, c) = Cell.innerText
    Next Cell
    c = 0
Next TR
Columns.AutoFit
Application.ScreenUpdating = True
End Sub
<style type=text/css>
    td {mso-number-format: '\@';}
</style>
<table ...

セルに上記のグローバルスタイル定義を配置する(<td>s)R orを使用して生成した出力では、クライアント側でドキュメントを次のように書き換えます。

Sub importhtml()
    '*********** HTML document rewrite process ***************
    Const TableUrl = "https://rasmusrhl.github.io/stuff"

    Const adTypeBinary = 1, adSaveCreateOverWrite = 2, TemporaryFolder = 2
    Dim tempFilePath, binData() As Byte

    With CreateObject("Scripting.FileSystemObject")
        tempFilePath = .BuildPath(.GetSpecialFolder(TemporaryFolder), .GetTempName() & ".html")
    End With

    'download HTML document
    With CreateObject("MSXML2.ServerXMLHTTP")
        .Open "GET", TableUrl, False
        .Send
        If .Status <> 200 Then Err.Raise 3, "importhtml", "200 expected"
        binData = .ResponseBody
    End With

    With CreateObject("Adodb.Stream")
        .Charset = "x-ansi"
        .Open
        .WriteText "<style type=text/css>td {mso-number-format:'\@';}</style>"
        .Position = 0 'move to start
        .Type = adTypeBinary 'change stream type
        .Position = .Size 'move to end
        .Write binData 'append binary data end of stream
        .SaveToFile tempFilePath, adSaveCreateOverWrite 'save temporary file
        .Close
    End With
    '*********** HTML document rewrite process ***************

    With ActiveSheet.QueryTables.Add(Connection:= _
        "URL;" & tempFilePath, Destination:=Range("$A$1"))
        'load HTML document from rewritten local copy

        .Name = "stuff"
        .FieldNames = True
        .RowNumbers = False
        .FillAdjacentFormulas = False
        .PreserveFormatting = False
        .RefreshOnFileOpen = False
        .BackgroundQuery = True
        .RefreshStyle = xlInsertDeleteCells
        .SavePassword = False
        .SaveData = True
        .AdjustColumnWidth = True
        .RefreshPeriod = 0
        .WebSelectionType = xlEntirePage
        .WebFormatting = xlWebFormattingAll
        .WebPreFormattedTextToColumns = True
        .WebConsecutiveDelimitersAsOne = True
        .WebSingleBlockTextImport = False
        .WebDisableDateRecognition = True
        .WebDisableRedirections = False
        .Refresh BackgroundQuery:=False

    End With

    Kill tempFilePath
End Sub
4
Kul-Tigin

これは一時ファイルで機能します。

機能:ローカルにデータをダウンロードします。次に、「(」を「\」に置き換えます。次に、データをインポートします。データをテキストとしてフォーマットします(エラーなく元に戻せるようにするため)。次に、テキストを変更します。これはRange.Replaceではできません。セルの内容が再フォーマットされるためです。

' Local Variables
Public FileName As String ' Temp File Path
Public FileUrl As String ' Url Formatted Temp File Path
Public DownloadUrl As String ' Where We're Going to Download From

' Declares Have to Be At Top
Private Declare Function GetTempPath Lib "kernel32" _
  Alias "GetTempPathA" _
  (ByVal nBufferLength As Long, _
  ByVal lpBuffer As String) As Long
Private Declare Function GetTempFileName Lib "kernel32" _
  Alias "GetTempFileNameA" _
  (ByVal lpszPath As String, _
  ByVal lpPrefixString As String, _
  ByVal wUnique As Long, _
  ByVal lpTempFileName As String) As Long

' Loads the HTML Content Without Bug
Sub ImportHtml()

    ' Set Our Download URL
    DownloadUrl = "https://rasmusrhl.github.io/stuff"

    ' Sets the Temporary File Path
    SetFilePath

    ' Downloads the File
    DownloadFile

    ' Replaces the "(" in the File With "\(", We Will Later Put it Back
    ' This Ensures Formatting of Content Isn't Modified!!!
    ReplaceStringInFile


    ' Our Query Table is Now Coming From the Local File, Instead
    Dim s As QueryTable
    Set s = ActiveSheet.QueryTables.Add(Connection:=("Finder;file://" + FileUrl), Destination:=Range("$A$1"))

    With s

        .Name = "stuff"
        .FieldNames = True
        .RowNumbers = False
        .FillAdjacentFormulas = False
        .PreserveFormatting = False
        .RefreshOnFileOpen = False
        .BackgroundQuery = True
        .RefreshStyle = xlInsertDeleteCells
        .SavePassword = False
        .SaveData = True
        .AdjustColumnWidth = True
        .RefreshPeriod = 0
        .WebSelectionType = xlEntirePage
        .WebFormatting = xlWebFormattingAll
        .WebPreFormattedTextToColumns = True
        .WebConsecutiveDelimitersAsOne = True
        .WebSingleBlockTextImport = False
        .WebDisableDateRecognition = True
        .WebDisableRedirections = False
        .Refresh BackgroundQuery:=False

        ' Sets Formatting So When We Change Text the Data Doesn't Change
        .ResultRange.NumberFormat = "@"

        ' Loop Through Cells in Range
        ' If You Do Excel Replace, Instead It Will Change Cell Format
        Const myStr As String = "\(", myReplace As String = "("
        For Each c In .ResultRange.Cells
            Do While c.Value Like "*" & myStr & "*"
                c.Characters(InStr(1, c.Value, myStr), Len(myStr)).Text = myReplace
            Loop
        Next


    End With
End Sub

' This function replaces the "(" in the file with "\("
Sub ReplaceStringInFile()

    Dim sBuf As String
    Dim sTemp As String
    Dim iFileNum As Integer
    Dim sFileName As String

    ' Edit as needed
    sFileName = FileName

    iFileNum = FreeFile
    Open sFileName For Input As iFileNum

    Do Until EOF(iFileNum)
        Line Input #iFileNum, sBuf
        sTemp = sTemp & sBuf & vbCrLf
    Loop
    Close iFileNum

    sTemp = Replace(sTemp, "(", "\(")

    iFileNum = FreeFile
    Open sFileName For Output As iFileNum
    Print #iFileNum, sTemp
    Close iFileNum

End Sub

' This function sets file paths because we need a temp file
Function SetFilePath()

    If FileName = "" Then
        FileName = GetTempHtmlName
        FileUrl = Replace(FileName, "\", "/")
    End If

End Function

' This subroutine downloads the file from the specified URL
' The download is necessary because we will be editing the file
Sub DownloadFile()

    Dim myURL As String
    myURL = "https://rasmusrhl.github.io/stuff"

    Dim WinHttpReq As Object
    Set WinHttpReq = CreateObject("Microsoft.XMLHTTP")
    WinHttpReq.Open "GET", DownloadUrl, False, "username", "password"
    WinHttpReq.send

    myURL = WinHttpReq.responseBody
    If WinHttpReq.Status = 200 Then
        Set oStream = CreateObject("ADODB.Stream")
        oStream.Open
        oStream.Type = 1
        oStream.Write WinHttpReq.responseBody
        oStream.SaveToFile FileName, 2 ' 1 = no overwrite, 2 = overwrite
        oStream.Close
    End If

End Sub

'''''''''''''''''''''''''''''
' THIS BLOCK OF CODE GETS A TEMPORARY FILE PATH USING THE GetTempHtmlName Function
'''''''''''''''''''''''''''''


Public Function GetTempHtmlName( _
  Optional sPrefix As String = "VBA", _
  Optional sExtensao As String = "") As String
  Dim sTmpPath As String * 512
  Dim sTmpName As String * 576
  Dim nRet As Long
  Dim F As String
  nRet = GetTempPath(512, sTmpPath)
  If (nRet > 0 And nRet < 512) Then
    nRet = GetTempFileName(sTmpPath, sPrefix, 0, sTmpName)
    If nRet <> 0 Then F = Left$(sTmpName, InStr(sTmpName, vbNullChar) - 1)
    If sExtensao > "" Then
      Kill F
      If Right(F, 4) = ".tmp" Then F = Left(F, Len(F) - 4)
      F = F & sExtensao
    End If
    F = Replace(F, ".tmp", ".html")
    GetTempHtmlName = F
  End If
End Function

'''''''''''''''''''''''''''''
' End - GetTempHtmlName
'''''''''''''''''''''''''''''
4
Ctznkane525

HTMLを処理してからExcelにコピーして貼り付けると、

使用した手順は次のとおりです。

  • CreateObject("MSXML2.XMLHTTP"):URLのresponseTextを取得
  • CreateObject("HTMLFile"):responseTextからHTMLドキュメントを作成します
  • グレーを黒に置き換えて境界線を暗くします
  • 列s1およびs2に接頭辞@を付けてフォーマットを保持
  • HTMLをWindowsクリップボードにコピーします
    • 注:HTMLを適切に貼り付けるには、HTMLタグとBodyタグで囲む必要があります
  • 宛先ワークシートのセットアップ
  • HTMLをワークシートに貼り付けます
  • @記号を' に置き換えます。
    • 注:これは、データをテキストとして保存することでフォーマットを保持します
  • ワークシートのフォーマットを完了する

enter image description here


Sub LoadTable()
    Const URL = "https://rasmusrhl.github.io/stuff/"
    Dim x As Long
    Dim doc As Object, tbl As Object, rw As Object
    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", URL, False
        .send
        If .readyState = 4 And .Status = 200 Then
            Set doc = CreateObject("HTMLFile")
            doc.body.innerHTML = .responseText
            doc.body.innerHTML = Replace(doc.body.innerHTML, "grey", "black")
            Set tbl = doc.getElementsByTagName("TABLE")(0)

            For x = 0 To tbl.Rows.Length - 1
                Set rw = tbl.Rows(x)

                If rw.Cells.Length = 14 Then
                    'If InStr(rw.Cells(12).innerText, "-") Or InStr(rw.Cells(12).innerText, "/") Then
                    rw.Cells(12).innerText = "@" & rw.Cells(12).innerText
                    rw.Cells(13).innerText = "@" & rw.Cells(13).innerText
                End If
            Next

            With CreateObject("new:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
                .SetText "<html><body>" & doc.body.innerHTML & "</body></html>"
                .PutInClipboard
            End With

            With Worksheets("Sheet1")
                .Cells.Clear
                .Range("A1").PasteSpecial
                .Cells.Interior.Color = vbWhite
                .Cells.WrapText = False
                .Columns.AutoFit
                .Columns("M:N").Replace What:="@", Replacement:="'"
            End With

        Else
            MsgBox "URL:  " & vbCrLf & "Ready state: " & .readyState & vbCrLf & "HTTP request status: " & .Status, vbInformation, "URL Not Responding"
        End If
    End With
End Sub
2
user6432984

これを試して、データをテーブルとしてインポートします。

Sub ImportDataAsTable()
    ActiveWorkbook.Queries.Add Name:="Table 0", Formula:= _
        "let" & Chr(13) & "" & Chr(10) & "    Source = Web.Page(Web.Contents(""https://rasmusrhl.github.io/stuff/""))," & Chr(13) & "" & Chr(10) & "    Data0 = Source{0}[Data]," & Chr(13) & "" & Chr(10) & "    #""Changed Type"" = Table.TransformColumnTypes(Data0,{{""tailnum"", type text}, {"""", type text}, {""Some text goes here. It is long and does not break Machine type (make) year"", type text}, {""Some text goes here. It is long and does not break Mach" & _
        "ine type (make) type"", type text}, {""Some text goes here. It is long and does not break Machine type (make) manufacturer"", type text}, {""Some text goes here. It is long and does not break"", type text}, {""Some text goes here. It is long and does not break Specification of machine model"", type text}, {""Some text goes here. It is long and does not break Specifi" & _
        "cation of machine engines"", type text}, {""Some text goes here. It is long and does not break Specification of machine seats"", type text}, {""Some text goes here. It is long and does not break Specification of machine speed"", type text}, {""Some text goes here. It is long and does not break Specification of machine engine"", type text}, {""2"", type text}, {""Oth" & _
        "er text goes here Other variables s1"", type text}, {""Other text goes here Other variables s2"", type text}})" & Chr(13) & "" & Chr(10) & "in" & Chr(13) & "" & Chr(10) & "    #""Changed Type"""
    ActiveWorkbook.Worksheets.Add
    With ActiveSheet.ListObjects.Add(SourceType:=0, Source:= _
        "OLEDB;Provider=Microsoft.Mashup.OleDb.1;Data Source=$Workbook$;Location=""Table 0"";Extended Properties=""""" _
        , Destination:=Range("$A$1")).QueryTable
        .CommandType = xlCmdSql
        .CommandText = Array("SELECT * FROM [Table 0]")
        .RowNumbers = False
        .FillAdjacentFormulas = False
        .PreserveFormatting = True
        .RefreshOnFileOpen = False
        .BackgroundQuery = True
        .RefreshStyle = xlInsertDeleteCells
        .SavePassword = False
        .SaveData = True
        .AdjustColumnWidth = True
        .RefreshPeriod = 0
        .PreserveColumnInfo = True
        .ListObject.DisplayName = "Table_0"
        .Refresh BackgroundQuery:=False
    End With
End Sub
2
ashleedawg

Microsoft MSDNライブラリ:WebFormattingプロパティ のドキュメントに基づいて、以下のコード変更を試すことができます。

 .WebFormatting = xlWebFormattingNone

これにより、数値をフォーマットせずにデータをコピーできる場合があります。これらのセルに独自の数値フォーマットを設定できます( MSDN:Excel VBA NumberFormat property

同様の解決策は、数値が切り捨てられたり丸められたりする問題を解決するはずです-ターゲット範囲内の影響を受けるセルの小数点を設定します...

1