mirror of
https://github.com/fumiama/go-docx.git
synced 2026-06-27 14:40:24 +08:00
优化代码结构
This commit is contained in:
1
LICENSE
1
LICENSE
@@ -3,6 +3,7 @@ MIT License
|
||||
Copyright (c) 2020 gingfrederik
|
||||
Copyright (c) 2021 Gonzalo Fernandez-Victorio
|
||||
Copyright (c) 2021 Basement Crowd Ltd (https://www.basementcrowd.com)
|
||||
Copyright (c) 2023 Fumiama Minamoto (源文雨)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
Yet another library to read and write .docx (a.k.a. Microsoft Word documents or ECMA-376 Office Open XML) files in Go.
|
||||
|
||||
This is a variant optimized and expanded by fumiama. The original repo is [gonfva/docxlib](https://github.com/gonfva/docxlib).
|
||||
|
||||
## Introduction
|
||||
|
||||
As part of my work for [Basement Crowd](https://www.basementcrowd.com) and [FromCounsel](https://www.fromcounsel.com), we were in need of a basic library to manipulate (both read and write) Microsoft Word documents.
|
||||
@@ -31,7 +33,7 @@ In the mean time, shared as an example in case somebody finds it useful.
|
||||
Go modules supported
|
||||
|
||||
```sh
|
||||
go get github.com/gonfva/docxlib
|
||||
go get github.com/fumiama/docxlib
|
||||
```
|
||||
|
||||
### Usage
|
||||
@@ -39,8 +41,7 @@ go get github.com/gonfva/docxlib
|
||||
See [main](main/main.go) for an example
|
||||
|
||||
```
|
||||
$ go build -o docxlib ./main
|
||||
$ ./docxlib
|
||||
$ go run ./cmd/main
|
||||
Preparing new document to write at /tmp/new-file.docx
|
||||
Document writen.
|
||||
Now trying to read it
|
||||
@@ -53,7 +54,7 @@ End of main
|
||||
```
|
||||
You can also increase the log level (-logtostderr=true -v=0) and just dump a specific file (-file /tmp/new-file.docx). See [getstructure/main](getstructure/main.go)
|
||||
```
|
||||
$ go build -o docxlib ./getstructure/ && ./docxlib -logtostderr=true -v=0 -file /tmp/new-file.docx
|
||||
$ go build -o docxlib ./cmd/getstructure/ && ./docxlib -logtostderr=true -v=0 -file /tmp/new-file.docx
|
||||
I0511 12:37:40.898493 18466 unpack.go:69] Relations: [...]
|
||||
I0511 12:37:40.898787 18466 unpack.go:47] Doc: [...]
|
||||
I0511 12:37:40.899330 18466 unpack.go:58] Paragraph [0xc000026d40 0xc000027d00 0xc000172340]
|
||||
|
||||
11
apilink.go
11
apilink.go
@@ -1,18 +1,19 @@
|
||||
package docxlib
|
||||
|
||||
import "strconv"
|
||||
import (
|
||||
"strconv"
|
||||
"sync/atomic"
|
||||
)
|
||||
|
||||
// when adding a hyperlink we need to store a reference in the relationship field
|
||||
func (f *DocxLib) addLinkRelation(link string) string {
|
||||
func (f *Docx) addLinkRelation(link string) string {
|
||||
rel := &Relationship{
|
||||
ID: "rId" + strconv.Itoa(f.rId),
|
||||
ID: "rId" + strconv.Itoa(int(atomic.AddUintptr(&f.rId, 1))),
|
||||
Type: REL_HYPERLINK,
|
||||
Target: link,
|
||||
TargetMode: REL_TARGETMODE,
|
||||
}
|
||||
|
||||
f.rId += 1
|
||||
|
||||
f.DocRelation.Relationships = append(f.DocRelation.Relationships, rel)
|
||||
|
||||
return rel.ID
|
||||
|
||||
@@ -1,17 +1,18 @@
|
||||
package docxlib
|
||||
|
||||
// AddParagraph adds a new paragraph
|
||||
func (f *DocxLib) AddParagraph() *Paragraph {
|
||||
func (f *Docx) AddParagraph() *Paragraph {
|
||||
p := &Paragraph{
|
||||
Data: make([]ParagraphChild, 0),
|
||||
Data: make([]ParagraphChild, 0, 64),
|
||||
file: f,
|
||||
}
|
||||
|
||||
f.Document.Body.Paragraphs = append(f.Document.Body.Paragraphs, p)
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
func (f *DocxLib) Paragraphs() []*Paragraph {
|
||||
func (f *Docx) Paragraphs() []*Paragraph {
|
||||
return f.Document.Body.Paragraphs
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@ func (r *Run) Size(size int) *Run {
|
||||
r.RunProperties.Size = &Size{
|
||||
Val: size * 2,
|
||||
}
|
||||
|
||||
return r
|
||||
}
|
||||
|
||||
|
||||
@@ -5,8 +5,8 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/fumiama/docxlib"
|
||||
"github.com/golang/glog"
|
||||
"github.com/gonfva/docxlib"
|
||||
)
|
||||
|
||||
var fileLocation *string
|
||||
@@ -40,7 +40,7 @@ func main() {
|
||||
if child.Link != nil {
|
||||
id := child.Link.ID
|
||||
text := child.Link.Run.InstrText
|
||||
link, err := doc.References(id)
|
||||
link, err := doc.Refer(id)
|
||||
if err != nil {
|
||||
fmt.Printf("\tWe found a link with id %s and text %s without target\n", id, text)
|
||||
} else {
|
||||
@@ -5,7 +5,7 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/gonfva/docxlib"
|
||||
"github.com/fumiama/docxlib"
|
||||
)
|
||||
|
||||
var fileLocation *string
|
||||
@@ -60,7 +60,7 @@ func main() {
|
||||
if child.Link != nil {
|
||||
id := child.Link.ID
|
||||
text := child.Link.Run.InstrText
|
||||
link, err := doc.References(id)
|
||||
link, err := doc.Refer(id)
|
||||
if err != nil {
|
||||
fmt.Printf("\tWe found a link with id %s and text %s without target\n", id, text)
|
||||
} else {
|
||||
47
docxlib.go
47
docxlib.go
@@ -6,33 +6,40 @@ import (
|
||||
"io"
|
||||
)
|
||||
|
||||
// DocxLib is the structure that allow to access the internal represntation
|
||||
var (
|
||||
// ErrRefIDNotFound cannot find such reference
|
||||
ErrRefIDNotFound = errors.New("ref id not found")
|
||||
)
|
||||
|
||||
// Docx is the structure that allows access to the internal representation
|
||||
// in memory of the doc (either read or about to be written)
|
||||
type DocxLib struct {
|
||||
type Docx struct {
|
||||
Document Document
|
||||
DocRelation Relationships
|
||||
|
||||
rId int
|
||||
rId uintptr
|
||||
}
|
||||
|
||||
// New generates a new empty docx file that we can manipulate and
|
||||
// later on, save
|
||||
func New() *DocxLib {
|
||||
return emptyFile()
|
||||
func New() *Docx {
|
||||
return newEmptyFile()
|
||||
}
|
||||
|
||||
// Parse generates a new docx file in memory from a reader
|
||||
// You can invoke it from a file
|
||||
// readFile, err := os.Open(FILE_PATH)
|
||||
// if err != nil {
|
||||
// panic(err)
|
||||
// }
|
||||
// fileinfo, err := readFile.Stat()
|
||||
// if err != nil {
|
||||
// panic(err)
|
||||
// }
|
||||
// size := fileinfo.Size()
|
||||
// doc, err := docxlib.Parse(readFile, int64(size))
|
||||
//
|
||||
// readFile, err := os.Open(FILE_PATH)
|
||||
// if err != nil {
|
||||
// panic(err)
|
||||
// }
|
||||
// fileinfo, err := readFile.Stat()
|
||||
// if err != nil {
|
||||
// panic(err)
|
||||
// }
|
||||
// size := fileinfo.Size()
|
||||
// doc, err := docxlib.Parse(readFile, int64(size))
|
||||
//
|
||||
// but also you can invoke from a webform (BEWARE of trusting users data!!!)
|
||||
//
|
||||
// func uploadFile(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -48,7 +55,7 @@ func New() *DocxLib {
|
||||
// defer file.Close()
|
||||
// docxlib.Parse(file, handler.Size)
|
||||
// }
|
||||
func Parse(reader io.ReaderAt, size int64) (doc *DocxLib, err error) {
|
||||
func Parse(reader io.ReaderAt, size int64) (doc *Docx, err error) {
|
||||
zipReader, err := zip.NewReader(reader, size)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -58,21 +65,21 @@ func Parse(reader io.ReaderAt, size int64) (doc *DocxLib, err error) {
|
||||
}
|
||||
|
||||
// Write saves the docx to a writer
|
||||
func (f *DocxLib) Write(writer io.Writer) (err error) {
|
||||
func (f *Docx) Write(writer io.Writer) (err error) {
|
||||
zipWriter := zip.NewWriter(writer)
|
||||
defer zipWriter.Close()
|
||||
|
||||
return f.pack(zipWriter)
|
||||
}
|
||||
|
||||
// References gets the url for a reference
|
||||
func (f *DocxLib) References(id string) (href string, err error) {
|
||||
// Refer gets the url for a reference
|
||||
func (f *Docx) Refer(id string) (href string, err error) {
|
||||
for _, a := range f.DocRelation.Relationships {
|
||||
if a.ID == id {
|
||||
href = a.Target
|
||||
return
|
||||
}
|
||||
}
|
||||
err = errors.New("id not found")
|
||||
err = ErrRefIDNotFound
|
||||
return
|
||||
}
|
||||
|
||||
50
empty.go
50
empty.go
@@ -2,29 +2,8 @@ package docxlib
|
||||
|
||||
import "encoding/xml"
|
||||
|
||||
func emptyRelationships() []*Relationship {
|
||||
defaultRel := []*Relationship{
|
||||
{
|
||||
ID: "rId1",
|
||||
Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles`,
|
||||
Target: "styles.xml",
|
||||
},
|
||||
{
|
||||
ID: "rId2",
|
||||
Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme`,
|
||||
Target: "theme/theme1.xml",
|
||||
},
|
||||
{
|
||||
ID: "rId3",
|
||||
Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable`,
|
||||
Target: "fontTable.xml",
|
||||
},
|
||||
}
|
||||
return defaultRel
|
||||
}
|
||||
|
||||
func emptyFile() *DocxLib {
|
||||
docx := &DocxLib{
|
||||
func newEmptyFile() *Docx {
|
||||
return &Docx{
|
||||
Document: Document{
|
||||
XMLName: xml.Name{
|
||||
Space: "w",
|
||||
@@ -35,14 +14,29 @@ func emptyFile() *DocxLib {
|
||||
XMLName: xml.Name{
|
||||
Space: "w",
|
||||
},
|
||||
Paragraphs: make([]*Paragraph, 0),
|
||||
Paragraphs: make([]*Paragraph, 0, 64),
|
||||
},
|
||||
},
|
||||
DocRelation: Relationships{
|
||||
Xmlns: XMLNS,
|
||||
Relationships: emptyRelationships(),
|
||||
Xmlns: XMLNS,
|
||||
Relationships: []*Relationship{
|
||||
{
|
||||
ID: "rId1",
|
||||
Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles`,
|
||||
Target: "styles.xml",
|
||||
},
|
||||
{
|
||||
ID: "rId2",
|
||||
Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme`,
|
||||
Target: "theme/theme1.xml",
|
||||
},
|
||||
{
|
||||
ID: "rId3",
|
||||
Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable`,
|
||||
Target: "fontTable.xml",
|
||||
},
|
||||
},
|
||||
},
|
||||
rId: 4,
|
||||
rId: 3,
|
||||
}
|
||||
return docx
|
||||
}
|
||||
|
||||
2
go.mod
2
go.mod
@@ -1,4 +1,4 @@
|
||||
module github.com/gonfva/docxlib
|
||||
module github.com/fumiama/docxlib
|
||||
|
||||
go 1.16
|
||||
|
||||
|
||||
20
helper.go
Normal file
20
helper.go
Normal file
@@ -0,0 +1,20 @@
|
||||
package docxlib
|
||||
|
||||
import (
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// BytesToString reinterprets b as a string without allocating or copying.
// The returned string shares b's backing array, so later writes to b are
// visible through the string.
func BytesToString(b []byte) string {
	// A string header is a prefix of a slice header (pointer, then length),
	// so the slice value can be read back as a string.
	header := unsafe.Pointer(&b)
	return *(*string)(header)
}
|
||||
|
||||
// StringToBytes reinterprets s as a byte slice without allocating or copying.
// The result has len and cap equal to len(s) and shares the string's memory,
// so callers must treat it as read-only: Go strings are immutable and may
// live in read-only memory.
func StringToBytes(s string) (b []byte) {
	// Build a three-word value laid out like a slice header (pointer, len,
	// cap) and reinterpret that. Casting &s itself to a slice-header pointer
	// would view a two-word string as a larger three-word struct, which the
	// unsafe.Pointer conversion rules do not allow.
	b = *(*[]byte)(unsafe.Pointer(
		&struct {
			string
			Cap int
		}{s, len(s)},
	))
	return b
}
|
||||
12
pack.go
12
pack.go
@@ -3,6 +3,7 @@ package docxlib
|
||||
import (
|
||||
"archive/zip"
|
||||
"encoding/xml"
|
||||
"strings"
|
||||
|
||||
"github.com/golang/glog"
|
||||
)
|
||||
@@ -10,7 +11,7 @@ import (
|
||||
// This receives a zip file writer (word documents are a zip with multiple xml inside)
|
||||
// and writes the relevant files. Some of them come from the empty_constants file,
|
||||
// others from the actual in-memory structure
|
||||
func (f *DocxLib) pack(zipWriter *zip.Writer) (err error) {
|
||||
func (f *Docx) pack(zipWriter *zip.Writer) (err error) {
|
||||
files := map[string]string{}
|
||||
|
||||
files["_rels/.rels"] = TEMP_REL
|
||||
@@ -34,7 +35,7 @@ func (f *DocxLib) pack(zipWriter *zip.Writer) (err error) {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = w.Write([]byte(data))
|
||||
_, err = w.Write(StringToBytes(data))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -44,12 +45,13 @@ func (f *DocxLib) pack(zipWriter *zip.Writer) (err error) {
|
||||
}
|
||||
|
||||
func marshal(data interface{}) (out string, err error) {
|
||||
body, err := xml.Marshal(data)
|
||||
sb := strings.Builder{}
|
||||
sb.WriteString(xml.Header)
|
||||
err = xml.NewEncoder(&sb).Encode(data)
|
||||
if err != nil {
|
||||
glog.Errorln("Error marshalling", err)
|
||||
return
|
||||
}
|
||||
|
||||
out = xml.Header + string(body)
|
||||
out = sb.String()
|
||||
return
|
||||
}
|
||||
|
||||
15
slice.go
Normal file
15
slice.go
Normal file
@@ -0,0 +1,15 @@
|
||||
package docxlib
|
||||
|
||||
import "unsafe"
|
||||
|
||||
// slice mirrors the runtime representation of a slice value: a data pointer
// followed by the length and the capacity. It is neither safe nor portable,
// and the layout may change in a later Go release.
//
// Unlike reflect.SliceHeader, data is an unsafe.Pointer, which is sufficient
// to keep the memory it references from being garbage collected.
type slice struct {
	data     unsafe.Pointer
	len, cap int
}
|
||||
@@ -17,11 +17,11 @@ type Paragraph struct {
|
||||
XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main p"`
|
||||
Data []ParagraphChild
|
||||
|
||||
file *DocxLib
|
||||
file *Docx
|
||||
}
|
||||
|
||||
func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
|
||||
children := make([]ParagraphChild, 0)
|
||||
children := make([]ParagraphChild, 0, 64)
|
||||
for {
|
||||
t, err := d.Token()
|
||||
if err == io.EOF {
|
||||
@@ -30,7 +30,8 @@ func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
|
||||
switch tt := t.(type) {
|
||||
case xml.StartElement:
|
||||
var elem ParagraphChild
|
||||
if tt.Name.Local == "hyperlink" {
|
||||
switch tt.Name.Local {
|
||||
case "hyperlink":
|
||||
var value Hyperlink
|
||||
d.DecodeElement(&value, &start)
|
||||
id := getAtt(tt.Attr, "id")
|
||||
@@ -41,20 +42,20 @@ func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
|
||||
if anchor != "" {
|
||||
value.ID = anchor
|
||||
}
|
||||
elem = ParagraphChild{Link: &value}
|
||||
} else if tt.Name.Local == "r" {
|
||||
elem.Link = &value
|
||||
case "r":
|
||||
var value Run
|
||||
d.DecodeElement(&value, &start)
|
||||
elem = ParagraphChild{Run: &value}
|
||||
elem.Run = &value
|
||||
if value.InstrText == "" && value.Text == nil {
|
||||
glog.V(0).Infof("Empty run, we ignore")
|
||||
continue
|
||||
}
|
||||
} else if tt.Name.Local == "rPr" {
|
||||
case "rPr":
|
||||
var value RunProperties
|
||||
d.DecodeElement(&value, &start)
|
||||
elem = ParagraphChild{Properties: &value}
|
||||
} else {
|
||||
elem.Properties = &value
|
||||
default:
|
||||
continue
|
||||
}
|
||||
children = append(children, elem)
|
||||
|
||||
12
structrun.go
12
structrun.go
@@ -76,19 +76,20 @@ func (r *Run) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
|
||||
|
||||
switch tt := t.(type) {
|
||||
case xml.StartElement:
|
||||
if tt.Name.Local == "rPr" {
|
||||
switch tt.Name.Local {
|
||||
case "rPr":
|
||||
var value RunProperties
|
||||
d.DecodeElement(&value, &start)
|
||||
elem.RunProperties = &value
|
||||
} else if tt.Name.Local == "instrText" {
|
||||
case "instrText":
|
||||
var value string
|
||||
d.DecodeElement(&value, &start)
|
||||
elem.InstrText = value
|
||||
} else if tt.Name.Local == "t" {
|
||||
case "t":
|
||||
var value Text
|
||||
d.DecodeElement(&value, &start)
|
||||
elem.Text = &value
|
||||
} else {
|
||||
default:
|
||||
continue
|
||||
}
|
||||
}
|
||||
@@ -109,8 +110,7 @@ func (r *Text) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
|
||||
|
||||
switch tt := t.(type) {
|
||||
case xml.CharData:
|
||||
cd := tt.Copy()
|
||||
elem.Text = string(cd)
|
||||
elem.Text = string(tt) // implicitly copy
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
53
unpack.go
53
unpack.go
@@ -4,7 +4,7 @@ package docxlib
|
||||
import (
|
||||
"archive/zip"
|
||||
"encoding/xml"
|
||||
"io/ioutil"
|
||||
"io"
|
||||
|
||||
"github.com/golang/glog"
|
||||
)
|
||||
@@ -13,68 +13,63 @@ import (
|
||||
// and parses the files that are relevant for us:
|
||||
// 1.-Document
|
||||
// 2.-Relationships
|
||||
func unpack(zipReader *zip.Reader) (docx *DocxLib, err error) {
|
||||
var doc *Document
|
||||
var relations *Relationships
|
||||
func unpack(zipReader *zip.Reader) (docx *Docx, err error) {
|
||||
docx = new(Docx)
|
||||
for _, f := range zipReader.File {
|
||||
if f.Name == "word/_rels/document.xml.rels" {
|
||||
relations, err = processRelations(f)
|
||||
err = processRelations(f, &docx.DocRelation)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return
|
||||
}
|
||||
}
|
||||
if f.Name == "word/document.xml" {
|
||||
doc, err = processDoc(f)
|
||||
err = processDoc(f, &docx.Document)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
docx = &DocxLib{
|
||||
Document: *doc,
|
||||
DocRelation: *relations,
|
||||
}
|
||||
return docx, nil
|
||||
return
|
||||
}
|
||||
|
||||
// Processes one of the relevant files, the one with the actual document
|
||||
func processDoc(file *zip.File) (*Document, error) {
|
||||
func processDoc(file *zip.File, doc *Document) error {
|
||||
filebytes, err := readZipFile(file)
|
||||
if err != nil {
|
||||
glog.Errorln("Error reading from internal zip file")
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
glog.V(0).Infoln("Doc:", string(filebytes))
|
||||
|
||||
doc := Document{
|
||||
XMLW: XMLNS_W,
|
||||
XMLR: XMLNS_R,
|
||||
XMLName: xml.Name{Space: XMLNS_W, Local: "document"}}
|
||||
err = xml.Unmarshal(filebytes, &doc)
|
||||
doc.XMLW = XMLNS_W
|
||||
doc.XMLR = XMLNS_R
|
||||
doc.XMLName.Space = XMLNS_W
|
||||
doc.XMLName.Local = "document"
|
||||
err = xml.Unmarshal(filebytes, doc)
|
||||
if err != nil {
|
||||
glog.Errorln("Error unmarshalling doc", string(filebytes))
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
glog.V(0).Infoln("Paragraph", doc.Body.Paragraphs)
|
||||
return &doc, nil
|
||||
return nil
|
||||
}
|
||||
|
||||
// Processes one of the relevant files, the one with the relationships
|
||||
func processRelations(file *zip.File) (*Relationships, error) {
|
||||
func processRelations(file *zip.File, rels *Relationships) error {
|
||||
filebytes, err := readZipFile(file)
|
||||
if err != nil {
|
||||
glog.Errorln("Error reading from internal zip file")
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
glog.V(0).Infoln("Relations:", string(filebytes))
|
||||
|
||||
rels := Relationships{Xmlns: XMLNS_R}
|
||||
err = xml.Unmarshal(filebytes, &rels)
|
||||
rels.Xmlns = XMLNS_R
|
||||
err = xml.Unmarshal(filebytes, rels)
|
||||
if err != nil {
|
||||
glog.Errorln("Error unmarshalling relationships")
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
return &rels, nil
|
||||
return nil
|
||||
}
|
||||
|
||||
// From a zip file structure, we return a byte array
|
||||
@@ -84,5 +79,5 @@ func readZipFile(zf *zip.File) ([]byte, error) {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
return ioutil.ReadAll(f)
|
||||
return io.ReadAll(f)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user