From eae5f9038566be22fcaa74425f0971c4dd6a854e Mon Sep 17 00:00:00 2001 From: Gonzalo Fernandez-Victorio Date: Fri, 23 Apr 2021 16:58:31 +0100 Subject: [PATCH] First commit --- .gitignore | 1 + LICENSE | 23 +++ README.md | 61 ++++++++ constants.go | 389 ++++++++++++++++++++++++++++++++++++++++++++++ document.go | 20 +++ docx.go | 37 +++++ empty.go | 48 ++++++ go.mod | 3 + link.go | 38 +++++ main/main.go | 56 +++++++ pack.go | 51 ++++++ paragraph.go | 42 +++++ relationship.go | 24 +++ run.go | 58 +++++++ run_properties.go | 29 ++++ unpack.go | 77 +++++++++ 16 files changed, 957 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 constants.go create mode 100644 document.go create mode 100644 docx.go create mode 100644 empty.go create mode 100644 go.mod create mode 100644 link.go create mode 100644 main/main.go create mode 100644 pack.go create mode 100644 paragraph.go create mode 100644 relationship.go create mode 100644 run.go create mode 100644 run_properties.go create mode 100644 unpack.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ce4461a --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +docxlib diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f4b2c27 --- /dev/null +++ b/LICENSE @@ -0,0 +1,23 @@ +MIT License + +Copyright (c) 2020 gingfrederik +Copyright (c) 2021 Gonzalo Fernandez-Victorio +Copyright (c) 2021 Basement Crowd Ltd (https://www.basementcrowd.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be 
included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..54e4737 --- /dev/null +++ b/README.md @@ -0,0 +1,61 @@ +# Docx library + +Yet another library to manipulate .docx (Microsoft Word) files in Go. + +## Introduction + +As part of my work for [Basement Crowd](https://www.basementcrowd.com) and [FromCounsel](https://www.fromcounsel.com), we were in need of a basic library to manipulate (both read and write) .docx files. + +The differences from other projects are the following: + +- [UniOffice](https://github.com/unidoc/unioffice) is probably the most complete but it is also commercial (you need to pay). It is also very complete, but too much for my needs. + +- [gingfrederik/docx](https://github.com/gingfrederik/docx) only allows writing. + +There are also a couple of other projects: [kingzbauer/docx](https://github.com/kingzbauer/docx) and [nguyenthenguyen/docx](https://github.com/nguyenthenguyen/docx). + +[gingfrederik/docx](https://github.com/gingfrederik/docx) was a heavy influence (the original structures and the main method come from that project). + +However, the structures didn't handle reading, and extending them was particularly difficult due to Go's XML parser being limited and a [6 year old bug](https://github.com/golang/go/issues/9519). + +Additionally, my requirements go beyond the original structure and a hard fork seemed more sensible. + +The plan is to evolve the library, so the API is likely to change according to my company's needs.
But please do feel free to send patches, reports and PRs or fork. + +In the mean time, shared as an example. + +## Getting Started + +### Install + +Go modules supported + +```sh +go get github.com/gonfva/docxlib +``` + +### Usage + +See [main](main/main.go) for an example + +``` +$ ./docxlib +Preparing new document to write at /tmp/new-file.docx +Document writen. +Now trying to read it + We've found a new run with the text ->test + We've found a new run with the text ->test font size + We've found a new run with the text ->test color + We've found a new run with the text ->test font size and color +End of main +``` + +### Build + +``` +$ go build -o docxlib ./main +``` + +## License + +MIT. See [LICENSE](LICENSE) diff --git a/constants.go b/constants.go new file mode 100644 index 0000000..ec8a94e --- /dev/null +++ b/constants.go @@ -0,0 +1,389 @@ +package docxlib + +const ( + TEMP_REL = ` + + + + + ` + TEMP_DOCPROPS_APP = `Go DOCX` + TEMP_DOCPROPS_CORE = `` + TEMP_CONTENT = ` + + + + + + + + + ` + TEMP_WORD_STYLE = ` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ` + TEMP_WORD_THEME_THEME = ` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ` +) diff --git a/document.go b/document.go new file mode 100644 index 0000000..5d71039 --- /dev/null +++ b/document.go @@ -0,0 +1,20 @@ +package docxlib + +import "encoding/xml" + +const ( + XMLNS_W = 
`http://schemas.openxmlformats.org/wordprocessingml/2006/main` + XMLNS_R = `http://schemas.openxmlformats.org/officeDocument/2006/relationships` +) + +type Body struct { + XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main body"` + Paragraphs []*Paragraph `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main p"` +} + +type Document struct { + XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main document"` + XMLW string `xml:"xmlns:w,attr"` + XMLR string `xml:"xmlns:r,attr"` + Body *Body +} diff --git a/docx.go b/docx.go new file mode 100644 index 0000000..2aea587 --- /dev/null +++ b/docx.go @@ -0,0 +1,37 @@ +package docxlib + +import ( + "archive/zip" + "io" +) + +type Docx struct { + Document Document + DocRelation Relationships + + rId int +} + +// New generates a new empty docx file that we can manipulate and +// later on, save +func New() *Docx { + return emptyFile() +} + +// Parse generates a new docx file in memory from a reader +func Parse(reader io.ReaderAt, size int64) (doc *Docx, err error) { + zipReader, err := zip.NewReader(reader, size) + if err != nil { + return nil, err + } + doc, err = unpack(zipReader) + return +} + +// Write serialises the document as a .docx (zip) archive into writer. +// +// zip.Writer.Close is what writes the archive's central directory, so its +// error must not be dropped: it is returned unless pack has already failed, +// in which case the pack error wins. The underlying writer is not closed. +func (f *Docx) Write(writer io.Writer) (err error) { + zipWriter := zip.NewWriter(writer) + defer func() { + if cerr := zipWriter.Close(); err == nil { + err = cerr + } + }() + + return f.pack(zipWriter) +} diff --git a/empty.go b/empty.go new file mode 100644 index 0000000..65633c3 --- /dev/null +++ b/empty.go @@ -0,0 +1,48 @@ +package docxlib + +import "encoding/xml" + +func emptyRelationships() []*Relationship { + defaultRel := []*Relationship{ + { + ID: "rId1", + Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles`, + Target: "styles.xml", + }, + { + ID: "rId2", + Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme`, + Target: "theme/theme1.xml", + }, + { + ID: "rId3", + Type: 
`http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable`, + Target: "fontTable.xml", + }, + } + return defaultRel +} + +// emptyFile builds the Docx returned by New: a document with an empty body, +// the default relationship list, and rId set to 4 so that the first link +// added gets the id following rId1-rId3 used by emptyRelationships. +func emptyFile() *Docx { + docx := &Docx{ + Document: Document{ + XMLName: xml.Name{ + Space: "w", + }, + XMLW: XMLNS_W, + XMLR: XMLNS_R, + Body: &Body{ + XMLName: xml.Name{ + Space: "w", + }, + Paragraphs: make([]*Paragraph, 0), + }, + }, + DocRelation: Relationships{ + Xmlns: XMLNS, + Relationships: emptyRelationships(), + }, + rId: 4, + } + return docx +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..c416266 --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module github.com/gonfva/docxlib + +go 1.16 diff --git a/link.go b/link.go new file mode 100644 index 0000000..bc661f4 --- /dev/null +++ b/link.go @@ -0,0 +1,38 @@ +package docxlib + +import "strconv" + +// addLinkRelation registers link as a hyperlink relationship with an +// external target under the id "rId<n>", n being the current rId counter, +// then increments the counter and returns the id. +func (f *Docx) addLinkRelation(link string) string { + rel := &Relationship{ + ID: "rId" + strconv.Itoa(f.rId), + Type: REL_HYPERLINK, + Target: link, + TargetMode: REL_TARGETMODE, + } + + f.rId += 1 + + f.DocRelation.Relationships = append(f.DocRelation.Relationships, rel) + + return rel.ID +} + +// AddLink appends a hyperlink to the paragraph and returns it. link is +// registered as a new relationship of the owning Docx; text is stored in the +// InstrText of the hyperlink's run, which carries the HYPERLINK_STYLE run +// style. +// NOTE(review): p.file is dereferenced unchecked; in the code visible here +// only AddParagraph sets it - confirm for paragraphs obtained via Parse. +func (p *Paragraph) AddLink(text string, link string) *Hyperlink { + rId := p.file.addLinkRelation(link) + hyperlink := &Hyperlink{ + ID: rId, + Run: Run{ + RunProperties: &RunProperties{ + RunStyle: &RunStyle{ + Val: HYPERLINK_STYLE, + }, + }, + InstrText: text, + }, + } + + p.Data = append(p.Data, ParagraphChild{Link: hyperlink}) + + return hyperlink +} diff --git a/main/main.go b/main/main.go new file mode 100644 index 0000000..2dac108 --- /dev/null +++ b/main/main.go @@ -0,0 +1,56 @@ +package main + +import ( + "fmt" + "os" + + "github.com/gonfva/docxlib" +) + +const FILE_PATH = "/tmp/new-file.docx" + +func main() { + fmt.Printf("Preparing new document to write at %s\n", FILE_PATH) + + w := docxlib.New() + // add new paragraph + para1 := w.AddParagraph() + // add text + para1.AddText("test") + + para1.AddText("test font 
size").Size(22) + para1.AddText("test color").Color("808080") + para2 := w.AddParagraph() + para2.AddText("test font size and color").Size(22).Color("ff0000") + + nextPara := w.AddParagraph() + nextPara.AddLink("google", `http://google.com`) + + f, err := os.Create(FILE_PATH) + if err != nil { + panic(err) + } + defer f.Close() + w.Write(f) + fmt.Println("Document writen. \nNow trying to read it") + // Now let's try to read the file + readFile, err := os.Open(FILE_PATH) + if err != nil { + panic(err) + } + fileinfo, err := readFile.Stat() + if err != nil { + panic(err) + } + size := fileinfo.Size() + doc, err := docxlib.Parse(readFile, int64(size)) + if err != nil { + panic(err) + } + for _, para := range doc.Paragraphs() { + for _, run := range para.Runs() { + fmt.Printf("\tWe've found a new run with the text ->%s\n", run.Text.Text) + } + } + fmt.Println("End of main") +} diff --git a/pack.go b/pack.go new file mode 100644 index 0000000..206bba6 --- /dev/null +++ b/pack.go @@ -0,0 +1,51 @@ +package docxlib + +import ( + "archive/zip" + "encoding/xml" + "fmt" +) + +// pack writes every part of the package into zipWriter. +// +// The relationships and the document are marshalled first, so a marshalling +// failure is returned before anything is written. The parts are then written +// from an ordered list instead of by ranging over a map: map iteration order +// is unspecified in Go, which made the entry order of the archive differ +// from run to run. "[Content_Types].xml" is written first, where +// Office-produced packages place it. +// The first error from creating or writing an entry aborts packing. +func (f *Docx) pack(zipWriter *zip.Writer) (err error) { + relations, err := marshal(f.DocRelation) + if err != nil { + return err + } + document, err := marshal(f.Document) + if err != nil { + return err + } + + parts := []struct{ path, data string }{ + {"[Content_Types].xml", TEMP_CONTENT}, + {"_rels/.rels", TEMP_REL}, + {"docProps/app.xml", TEMP_DOCPROPS_APP}, + {"docProps/core.xml", TEMP_DOCPROPS_CORE}, + {"word/_rels/document.xml.rels", relations}, + {"word/document.xml", document}, + {"word/styles.xml", TEMP_WORD_STYLE}, + {"word/theme/theme1.xml", TEMP_WORD_THEME_THEME}, + } + for _, part := range parts { + w, err := zipWriter.Create(part.path) + if err != nil { + return err + } + + if _, err = w.Write([]byte(part.data)); err != nil { + return err + } + } + + return nil +} + +func marshal(data interface{}) (out string, err error) { + body, err := xml.Marshal(data) + if err != nil { + fmt.Println(err) + return + } + + out = 
xml.Header + string(body) + return +} diff --git a/paragraph.go b/paragraph.go new file mode 100644 index 0000000..dc2bafc --- /dev/null +++ b/paragraph.go @@ -0,0 +1,42 @@ +package docxlib + +import ( + "encoding/xml" +) + +// ParagraphChild is one child of a paragraph: either a hyperlink (Link) or a +// plain run (Run). +type ParagraphChild struct { + Link *Hyperlink `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main hyperlink"` + Run *Run `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main r"` +} + +// Paragraph is a w:p element. Data holds its children in order and file +// points back to the owning Docx, which AddLink needs to register +// relationships. +// NOTE(review): Data carries no xml tag, so encoding/xml names the element +// after the field ("Data") - confirm that wrapper element is intended. +type Paragraph struct { + XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main p"` + Data []ParagraphChild + + file *Docx +} + +// AddParagraph appends a new, empty paragraph to the document body and +// returns it. Body is a pointer and may be nil (for instance on a Docx that +// was not built by New), so it is created first when missing instead of +// being dereferenced blindly. +func (f *Docx) AddParagraph() *Paragraph { + p := &Paragraph{ + Data: make([]ParagraphChild, 0), + file: f, + } + + if f.Document.Body == nil { + f.Document.Body = &Body{} + } + f.Document.Body.Paragraphs = append(f.Document.Body.Paragraphs, p) + return p +} + +// Paragraphs returns the paragraphs held by the document body, or nil when +// the document has no body. +func (f *Docx) Paragraphs() []*Paragraph { + if f.Document.Body == nil { + return nil + } + return f.Document.Body.Paragraphs +} + +// Runs returns, in order, the plain runs among the paragraph's children. +// Children whose Run is nil (hyperlinks) are skipped; the result is nil when +// there is no run. +func (p *Paragraph) Runs() (ret []*Run) { + data := p.Data + for _, d := range data { + if d.Run != nil { + ret = append(ret, d.Run) + } + } + return +} diff --git a/relationship.go b/relationship.go new file mode 100644 index 0000000..8c96b0d --- /dev/null +++ b/relationship.go @@ -0,0 +1,24 @@ +package docxlib + +import "encoding/xml" + +const ( + XMLNS = `http://schemas.openxmlformats.org/package/2006/relationships` + REL_HYPERLINK = `http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink` + + REL_TARGETMODE = "External" +) + +type Relationships struct { + XMLName xml.Name `xml:"Relationships"` + Xmlns string `xml:"xmlns,attr"` + Relationships []*Relationship `xml:"Relationship"` +} + +type Relationship struct { + XMLName xml.Name `xml:"Relationship"` + ID string `xml:"Id,attr"` + Type string `xml:"Type,attr"` + Target string `xml:"Target,attr"` + TargetMode string `xml:"TargetMode,attr,omitempty"` +} diff --git a/run.go b/run.go new file mode 100644 index 0000000..f687061 --- /dev/null +++ b/run.go @@ -0,0 +1,58 @@ +package docxlib 
+import "encoding/xml" + +// A Run is part of a paragraph that has its own style. It could be +// a piece of text in bold, or a link +type Run struct { + XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main r"` + RunProperties *RunProperties `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rPr,omitempty"` + InstrText string `xml:"w:instrText,omitempty"` + Text *Text +} + +// The Text object contains the actual text +type Text struct { + XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main t"` + XMLSpace string `xml:"xml:space,attr,omitempty"` + Text string `xml:",chardata"` +} + +type Hyperlink struct { + XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main hyperlink"` + ID string `xml:"http://schemas.openxmlformats.org/officeDocument/2006/relationships id,attr"` + Run Run +} + +// Color sets the colour of the run to color, stored unchanged as the value +// of its color property, and returns the run so calls can be chained. +// RunProperties is a pointer and is nil on a run decoded without an rPr +// element, so it is created on demand rather than dereferenced blindly. +func (r *Run) Color(color string) *Run { + if r.RunProperties == nil { + r.RunProperties = &RunProperties{} + } + r.RunProperties.Color = &Color{ + Val: color, + } + + return r +} + +// Size sets the font size of the run and returns the run so calls can be +// chained. The stored value is size*2 (w:sz is expressed in half-points). +// RunProperties is created on demand when the run has none. +func (r *Run) Size(size int) *Run { + if r.RunProperties == nil { + r.RunProperties = &RunProperties{} + } + r.RunProperties.Size = &Size{ + Val: size * 2, + } + return r +} + +// AddText add text to paragraph +func (p *Paragraph) AddText(text string) *Run { + t := &Text{ + Text: text, + } + + run := &Run{ + Text: t, + RunProperties: &RunProperties{}, + } + + p.Data = append(p.Data, ParagraphChild{Run: run}) + + return run +} diff --git a/run_properties.go b/run_properties.go new file mode 100644 index 0000000..5e3f567 --- /dev/null +++ b/run_properties.go @@ -0,0 +1,29 @@ +package docxlib + +import "encoding/xml" + +const ( + HYPERLINK_STYLE = "a1" +) + +type RunProperties struct { + XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rPr"` + Color *Color `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main color,omitempty"` + Size *Size `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main sz,omitempty"` + RunStyle 
*RunStyle `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rStyle,omitempty"` +} + +type RunStyle struct { + XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rStyle"` + Val string `xml:"w:val,attr"` +} + +type Color struct { + XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main color"` + Val string `xml:"w:val,attr"` +} + +type Size struct { + XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main sz"` + Val int `xml:"w:val,attr"` +} diff --git a/unpack.go b/unpack.go new file mode 100644 index 0000000..fc4dae5 --- /dev/null +++ b/unpack.go @@ -0,0 +1,77 @@ +package docxlib + +import ( + "archive/zip" + "encoding/xml" + "fmt" + "io/ioutil" +) + +// unpack builds a Docx from the entries of an already opened zip archive. +// +// Only "word/document.xml" and "word/_rels/document.xml.rels" are read. An +// archive without the document part is rejected with an error instead of +// dereferencing a nil pointer; a missing relationships part is treated as an +// empty relationship list. +// +// After decoding, rId is moved past the highest "rId<n>" id already present +// and every paragraph is linked back to the returned Docx, so that AddLink +// on a parsed document neither reuses an existing id nor hits a nil file. +func unpack(zipReader *zip.Reader) (docx *Docx, err error) { + var doc *Document + var relations *Relationships + for _, f := range zipReader.File { + switch f.Name { + case "word/_rels/document.xml.rels": + if relations, err = processRelations(f); err != nil { + return nil, err + } + case "word/document.xml": + if doc, err = processDoc(f); err != nil { + return nil, err + } + } + } + if doc == nil { + return nil, fmt.Errorf("docxlib: archive has no %q part", "word/document.xml") + } + if relations == nil { + relations = &Relationships{Xmlns: XMLNS} + } + + docx = &Docx{ + Document: *doc, + DocRelation: *relations, + rId: 1, + } + // Ids that do not follow the "rId<n>" form cannot clash with generated ones. + for _, rel := range relations.Relationships { + var n int + if _, scanErr := fmt.Sscanf(rel.ID, "rId%d", &n); scanErr == nil && n >= docx.rId { + docx.rId = n + 1 + } + } + if body := docx.Document.Body; body != nil { + for _, p := range body.Paragraphs { + p.file = docx + } + } + return docx, nil +} + +func processDoc(file *zip.File) (*Document, error) { + filebytes, err := readZipFile(file) + if err != nil { + fmt.Println("Error reading from internal zip file") + return nil, err + } + doc := Document{ + XMLW: XMLNS_W, + XMLR: XMLNS_R, + XMLName: xml.Name{Space: XMLNS_W, Local: "document"}} + err = xml.Unmarshal(filebytes, &doc) + //r := bytes.NewReader(filebytes) + //err = decode(r) + if err != nil { + fmt.Println("Error unmarshalling doc") + fmt.Println(string(filebytes)) + return nil, err + } + return &doc, nil +} + +func processRelations(file *zip.File) (*Relationships, error) { + filebytes, err := readZipFile(file) + if err != nil { + fmt.Println("Error reading from internal zip file") + return nil, err + } + rels := 
Relationships{Xmlns: "none"} + err = xml.Unmarshal(filebytes, &rels) + if err != nil { + fmt.Println("Error unmarshalling relationships") + return nil, err + } + return &rels, nil +} + +// readZipFile opens the archive entry zf and returns its whole content. +// The entry reader is closed when the function returns; an error from Open +// or from reading is returned to the caller. +func readZipFile(zf *zip.File) ([]byte, error) { + f, err := zf.Open() + if err != nil { + return nil, err + } + defer f.Close() + return ioutil.ReadAll(f) +}