diff --git a/README.md b/README.md index e6166a9..3f5328c 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,22 @@ Now trying to read it We've found a new hyperlink with ref http://google.com and the text google End of main ``` - +You can also increase the log level and just dump a specific file. See [getstructure/main](getstructure/main.go) +``` +$ go build -o docxlib ./getstructure/ && ./docxlib -logtostderr=true -v=0 +I0511 12:37:40.898493 18466 unpack.go:69] Relations: [...] +I0511 12:37:40.898787 18466 unpack.go:47] Doc: [...] +I0511 12:37:40.899330 18466 unpack.go:58] Paragraph [0xc000026d40 0xc000027d00 0xc000172340] +I0511 12:37:40.899369 18466 main.go:31] There is a new paragraph [...] + We've found a new run with the text ->test + We've found a new run with the text ->test font size + We've found a new run with the text ->test color +I0511 12:37:40.899389 18466 main.go:31] There is a new paragraph [...] + We've found a new run with the text ->test font size and color +I0511 12:37:40.899396 18466 main.go:31] There is a new paragraph [...] 
+ We've found a new hyperlink with ref http://google.com and the text google +End of main +``` ### Build ``` diff --git a/getstructure/main.go b/getstructure/main.go new file mode 100644 index 0000000..b1e5cec --- /dev/null +++ b/getstructure/main.go @@ -0,0 +1,56 @@ +package main + +import ( + "flag" + "fmt" + "os" + + "github.com/golang/glog" + "github.com/gonfva/docxlib" +) + +// FILE_PATH is the location of the .docx file that main opens and parses. +const FILE_PATH = "/tmp/new-file.docx" + +// main opens FILE_PATH, parses it with docxlib.Parse and prints the text of every run and the +// target of every hyperlink found in each paragraph. Any failure to open, stat or parse panics. +func main() { + flag.Parse() + //Now let's try to read the file + readFile, err := os.Open(FILE_PATH) + if err != nil { + panic(err) + } + // The file is only read, so the error returned by Close is not checked. + defer readFile.Close() + fileinfo, err := readFile.Stat() + if err != nil { + panic(err) + } + size := fileinfo.Size() + doc, err := docxlib.Parse(readFile, int64(size)) + if err != nil { + panic(err) + } + for _, para := range doc.Paragraphs() { + glog.Infoln("There is a new paragraph", para) + for _, child := range para.Children() { + // Text is a pointer; runs that carry no text are skipped rather than dereferenced. + if child.Run != nil && child.Run.Text != nil { + fmt.Printf("\tWe've found a new run with the text ->%s\n", child.Run.Text.Text) + } + if child.Link != nil { + id := child.Link.ID + text := child.Link.Run.InstrText + link, err := doc.References(id) + if err != nil { + fmt.Printf("\tWe found a link with id %s and text %s without target\n", id, text) + } else { + fmt.Printf("\tWe've found a new hyperlink with ref %s and the text %s\n", link, text) + } + + } + } + } + fmt.Println("End of main") +} diff --git a/go.mod b/go.mod index c416266..588b084 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,5 @@ module github.com/gonfva/docxlib go 1.16 + +require github.com/golang/glog v0.0.0-20210429001901-424d2337a529 // indirect diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..41b530d --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +github.com/golang/glog v0.0.0-20210429001901-424d2337a529 h1:2voWjNECnrZRbfwXxHB1/j8wa6xdKn85B5NzgVL/pTU= +github.com/golang/glog v0.0.0-20210429001901-424d2337a529/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= diff --git a/pack.go b/pack.go index d513fc6..5569a86 100644 --- a/pack.go +++ 
b/pack.go @@ -3,7 +3,8 @@ package docxlib import ( "archive/zip" "encoding/xml" - "fmt" + + "github.com/golang/glog" ) // This receives a zip file writer (word documents are a zip with multiple xml inside) @@ -45,7 +46,7 @@ func (f *DocxLib) pack(zipWriter *zip.Writer) (err error) { func marshal(data interface{}) (out string, err error) { body, err := xml.Marshal(data) if err != nil { - fmt.Println(err) + glog.Errorln("Error marshalling", err) return } diff --git a/structdoc_test.go b/structdoc_test.go new file mode 100644 index 0000000..13a684e --- /dev/null +++ b/structdoc_test.go @@ -0,0 +1,33 @@ +package docxlib + +import ( + "encoding/xml" + "testing" +) + +const decoded_doc = `testtest font sizetest colorNew style 1New style 2test font size and colorgoogle` +const NUM_PARAGRAPHS = 5 + +// TestStructure unmarshals decoded_doc into a Document and checks that NUM_PARAGRAPHS +// paragraphs are found and that every one of them has at least one child. +func TestStructure(t *testing.T) { + doc := Document{ + XMLW: XMLNS_W, + XMLR: XMLNS_R, + XMLName: xml.Name{Space: XMLNS_W, Local: "document"}} + err := xml.Unmarshal([]byte(decoded_doc), &doc) + if err != nil { + t.Fatalf("We expected to be able to decode %s but we didn't: %v", + decoded_doc, err) + } + if len(doc.Body.Paragraphs) != NUM_PARAGRAPHS { + t.Errorf("We expected %d paragraph, we got %d", + NUM_PARAGRAPHS, len(doc.Body.Paragraphs)) + } + for _, p := range doc.Body.Paragraphs { + if len(p.Children()) == 0 { + t.Errorf("We were not able to parse paragraph %v", + p) + } + } +} diff --git a/structnodes.go b/structnodes.go index f5d16ee..d9c9ebd 100644 --- a/structnodes.go +++ b/structnodes.go @@ -3,8 +3,9 @@ package docxlib import "encoding/xml" type ParagraphChild struct { - Link *Hyperlink `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main hyperlink"` - Run *Run `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main r"` + Link *Hyperlink `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main hyperlink,omitempty"` + Run *Run `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main r,omitempty"` + Properties *RunProperties 
`xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rPr,omitempty"` } type Paragraph struct { diff --git a/structrun.go b/structrun.go index f145165..e2c1bae 100644 --- a/structrun.go +++ b/structrun.go @@ -9,7 +9,7 @@ const ( // A Run is part of a paragraph that has its own style. It could be // a piece of text in bold, or a link type Run struct { - XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main r"` + XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main r,omitempty"` RunProperties *RunProperties `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rPr,omitempty"` InstrText string `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main instrText,omitempty"` Text *Text @@ -24,22 +24,29 @@ type Text struct { // The hyperlink element contains links type Hyperlink struct { - XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main hyperlink"` + XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main hyperlink,omitempty"` ID string `xml:"http://schemas.openxmlformats.org/officeDocument/2006/relationships id,attr"` Run Run } // RunProperties encapsulates visual properties of a run type RunProperties struct { - XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rPr"` + XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rPr,omitempty"` Color *Color `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main color,omitempty"` Size *Size `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main sz,omitempty"` RunStyle *RunStyle `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rStyle,omitempty"` + Style *Style `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main pStyle,omitempty"` } // RunStyle contains styling for a run type RunStyle struct { - XMLName xml.Name 
`xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rStyle"` + XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rStyle,omitempty"` + Val string `xml:"w:val,attr"` +} + +// Style contains styling for a paragraph +type Style struct { + XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main pStyle,omitempty"` Val string `xml:"w:val,attr"` } diff --git a/unpack.go b/unpack.go index b97a641..ba04311 100644 --- a/unpack.go +++ b/unpack.go @@ -4,8 +4,9 @@ package docxlib import ( "archive/zip" "encoding/xml" - "fmt" "io/ioutil" + + "github.com/golang/glog" ) // This receives a zip file (word documents are a zip with multiple xml inside) @@ -40,19 +41,21 @@ func unpack(zipReader *zip.Reader) (docx *DocxLib, err error) { func processDoc(file *zip.File) (*Document, error) { filebytes, err := readZipFile(file) if err != nil { - fmt.Println("Error reading from internal zip file") + glog.Errorln("Error reading from internal zip file") return nil, err } + glog.V(0).Infoln("Doc:", string(filebytes)) + doc := Document{ XMLW: XMLNS_W, XMLR: XMLNS_R, XMLName: xml.Name{Space: XMLNS_W, Local: "document"}} err = xml.Unmarshal(filebytes, &doc) if err != nil { - fmt.Println("Error unmarshalling doc") - fmt.Println(string(filebytes)) + glog.Errorln("Error unmarshalling doc", string(filebytes)) return nil, err } + glog.V(0).Infoln("Paragraph", doc.Body.Paragraphs) return &doc, nil } @@ -60,13 +63,15 @@ func processDoc(file *zip.File) (*Document, error) { func processRelations(file *zip.File) (*Relationships, error) { filebytes, err := readZipFile(file) if err != nil { - fmt.Println("Error reading from internal zip file") + glog.Errorln("Error reading from internal zip file") return nil, err } + glog.V(0).Infoln("Relations:", string(filebytes)) + rels := Relationships{Xmlns: XMLNS_R} err = xml.Unmarshal(filebytes, &rels) if err != nil { - fmt.Println("Error unmarshalling relationships") + glog.Errorln("Error 
unmarshalling relationships") return nil, err } return &rels, nil