mirror of
https://github.com/larksuite/cli.git
synced 2026-07-03 14:02:43 +08:00
feat(sheets): add --dataframe Arrow IPC input for +table-put/+table-get/+workbook-create
Introduce a binary-typed twin of --sheets: --dataframe accepts an Arrow IPC (Feather v2) payload that pandas' df.to_feather() writes, deriving dtypes and per-column number formats from the Arrow schema. The two producers are mutually exclusive and funnel through a shared resolver so +table-put and +workbook-create stay in lockstep; +table-get gains --dataframe-out for single-sheet reads. Also auto-grow a sub-sheet's row/column count before writing so blocks past the backend's default 200x20 bounds no longer fail with range-exceeds-sheet-bounds.
This commit is contained in:
12
go.mod
12
go.mod
@@ -27,6 +27,8 @@ require (
|
||||
gopkg.in/yaml.v3 v3.0.1
|
||||
)
|
||||
|
||||
require github.com/apache/arrow/go/v17 v17.0.0
|
||||
|
||||
require (
|
||||
github.com/atotto/clipboard v0.1.4 // indirect
|
||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
|
||||
@@ -42,13 +44,17 @@ require (
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
|
||||
github.com/goccy/go-json v0.10.3 // indirect
|
||||
github.com/godbus/dbus/v5 v5.2.2 // indirect
|
||||
github.com/gogo/protobuf v1.3.2 // indirect
|
||||
github.com/google/flatbuffers v24.3.25+incompatible // indirect
|
||||
github.com/gopherjs/gopherjs v1.17.2 // indirect
|
||||
github.com/gorilla/websocket v1.5.0 // indirect
|
||||
github.com/inconshreveable/mousetrap v1.1.0 // indirect
|
||||
github.com/itchyny/timefmt-go v0.1.6 // indirect
|
||||
github.com/jtolds/gls v4.20.0+incompatible // indirect
|
||||
github.com/klauspost/compress v1.17.9 // indirect
|
||||
github.com/klauspost/cpuid/v2 v2.2.8 // indirect
|
||||
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/mattn/go-localereader v0.0.1 // indirect
|
||||
@@ -57,10 +63,16 @@ require (
|
||||
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
|
||||
github.com/muesli/cancelreader v0.2.2 // indirect
|
||||
github.com/muesli/termenv v0.16.0 // indirect
|
||||
github.com/pierrec/lz4/v4 v4.1.21 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
github.com/rivo/uniseg v0.4.7 // indirect
|
||||
github.com/smarty/assertions v1.15.0 // indirect
|
||||
github.com/tidwall/match v1.1.1 // indirect
|
||||
github.com/tidwall/pretty v1.2.0 // indirect
|
||||
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
|
||||
github.com/zeebo/xxh3 v1.0.2 // indirect
|
||||
golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 // indirect
|
||||
golang.org/x/mod v0.18.0 // indirect
|
||||
golang.org/x/tools v0.22.0 // indirect
|
||||
golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect
|
||||
)
|
||||
|
||||
32
go.sum
32
go.sum
@@ -2,6 +2,8 @@ github.com/MakeNowJust/heredoc v1.0.0 h1:cXCdzVdstXyiTqTvfqk9SDHpKNjxuom+DOlyEeQ
|
||||
github.com/MakeNowJust/heredoc v1.0.0/go.mod h1:mG5amYoWBHf8vpLOuehzbGGw0EHxpZZ6lCpQ4fNJ8LE=
|
||||
github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
|
||||
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
|
||||
github.com/apache/arrow/go/v17 v17.0.0 h1:RRR2bdqKcdbss9Gxy2NS/hK8i4LDMh23L6BbkN5+F54=
|
||||
github.com/apache/arrow/go/v17 v17.0.0/go.mod h1:jR7QHkODl15PfYyjM2nU+yTLScZ/qfj7OSUZmJ8putc=
|
||||
github.com/atotto/clipboard v0.1.4 h1:EH0zSVneZPSuFR11BlR9YppQTVDbh5+16AmcJi4g1z4=
|
||||
github.com/atotto/clipboard v0.1.4/go.mod h1:ZY9tmq7sm5xIbd9bOK4onWV4S6X0u6GY7Vn0Yu86PYI=
|
||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
|
||||
@@ -52,12 +54,16 @@ github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkp
|
||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4=
|
||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM=
|
||||
github.com/goccy/go-json v0.10.3 h1:KZ5WoDbxAIgm2HNbYckL0se1fHD6rz5j4ywS6ebzDqA=
|
||||
github.com/goccy/go-json v0.10.3/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M=
|
||||
github.com/godbus/dbus/v5 v5.2.2 h1:TUR3TgtSVDmjiXOgAAyaZbYmIeP3DPkld3jgKGV8mXQ=
|
||||
github.com/godbus/dbus/v5 v5.2.2/go.mod h1:3AAv2+hPq5rdnr5txxxRwiGjPXamgoIHgz9FPBfOp3c=
|
||||
github.com/gofrs/flock v0.8.1 h1:+gYjHKf32LDeiEEFhQaotPbLuUXjY5ZqxKgXy7n59aw=
|
||||
github.com/gofrs/flock v0.8.1/go.mod h1:F1TvTiK9OcQqauNUHlbJvyl9Qa1QvF/gOUDKA14jxHU=
|
||||
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
|
||||
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
|
||||
github.com/google/flatbuffers v24.3.25+incompatible h1:CX395cjN9Kke9mmalRoL3d81AtFUxJM+yDthflgJGkI=
|
||||
github.com/google/flatbuffers v24.3.25+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/gopherjs/gopherjs v1.17.2 h1:fQnZVsXk8uxXIStYb0N4bGk7jeyTalG/wsZjQ25dO0g=
|
||||
@@ -74,11 +80,16 @@ github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7
|
||||
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
|
||||
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
|
||||
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
|
||||
github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA=
|
||||
github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw=
|
||||
github.com/klauspost/cpuid/v2 v2.2.8 h1:+StwCXwm9PdpiEkPyzBXIy+M9KUb4ODm0Zarf1kS5BM=
|
||||
github.com/klauspost/cpuid/v2 v2.2.8/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
|
||||
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
|
||||
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
|
||||
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
|
||||
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
|
||||
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/larksuite/oapi-sdk-go/v3 v3.5.4 h1:U2S9x9LrfH++ZqJ+YAiUlqzCWJmVXhFdS8Z7rIBH8H0=
|
||||
github.com/larksuite/oapi-sdk-go/v3 v3.5.4/go.mod h1:ZEplY+kwuIrj/nqw5uSCINNATcH3KdxSN7y+UxYY5fI=
|
||||
github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
|
||||
@@ -97,6 +108,8 @@ github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELU
|
||||
github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
|
||||
github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc=
|
||||
github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk=
|
||||
github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ=
|
||||
github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
||||
@@ -133,14 +146,20 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de
|
||||
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
||||
github.com/zalando/go-keyring v0.2.8 h1:6sD/Ucpl7jNq10rM2pgqTs0sZ9V3qMrqfIIy5YPccHs=
|
||||
github.com/zalando/go-keyring v0.2.8/go.mod h1:tsMo+VpRq5NGyKfxoBVjCuMrG47yj8cmakZDO5QGii0=
|
||||
github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ=
|
||||
github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0=
|
||||
github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0=
|
||||
github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA=
|
||||
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
|
||||
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
|
||||
golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI=
|
||||
golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo=
|
||||
golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 h1:LfspQV/FYTatPTr/3HzIcmiUFH7PGP+OQ6mgDYo3yuQ=
|
||||
golang.org/x/exp v0.0.0-20240222234643-814bf88cf225/go.mod h1:CxmFvTBINI24O/j8iY7H1xHzx2i4OsyguNBmN/uPtqc=
|
||||
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
|
||||
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
|
||||
golang.org/x/mod v0.18.0 h1:5+9lSbEzPSdWkH32vYPBwEpX8KwDbM52Ud9xBUvNlb0=
|
||||
golang.org/x/mod v0.18.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
|
||||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
@@ -156,6 +175,7 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h
|
||||
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw=
|
||||
golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
|
||||
@@ -169,10 +189,16 @@ golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGm
|
||||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
|
||||
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
|
||||
golang.org/x/tools v0.22.0 h1:gqSGLZqv+AI9lIQzniJ0nZDRG5GBPsSi+DRNHWNz6yA=
|
||||
golang.org/x/tools v0.22.0/go.mod h1:aCwcsjqvq7Yqt6TNyX7QMU2enbQ/Gt0bo6krSeEri+c=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 h1:+cNy6SZtPcJQH3LJVLOSmiC7MMxXNOb3PU/VUEz+EhU=
|
||||
golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90=
|
||||
gonum.org/v1/gonum v0.15.0 h1:2lYxjRbTYyxkJxlhC+LvJIx3SsANPdRybu1tGj9/OrQ=
|
||||
gonum.org/v1/gonum v0.15.0/go.mod h1:xzZVBJBtS+Mz4q0Yl2LJTk+OxOg4jiXZ7qBoM0uISGo=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
|
||||
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
|
||||
@@ -526,7 +526,7 @@
|
||||
"kind": "own",
|
||||
"type": "string",
|
||||
"required": "optional",
|
||||
"desc": "Typed table payload as JSON (same shape as `+table-put`): a top-level `sheets` array, each item `{name, start_cell?, mode?, header?, allow_overwrite?, columns:[\"colA\",\"colB\",...], data:[[...]], dtypes?:{colA:pandasDtype, ...}, formats?:{colA:numberFormat, ...}}`. Agents typically build it from a DataFrame via `{**json.loads(df.to_json(orient=\"split\")), \"dtypes\": df.dtypes.astype(str).to_dict()}`. Mutually exclusive with --values. Creates the workbook, then writes typed type-faithful data (dates land as real dates, numbers keep precision).",
|
||||
"desc": "Typed table payload as JSON (same shape as `+table-put`): a top-level `sheets` array, each item `{name, start_cell?, mode?, header?, allow_overwrite?, columns:[\"colA\",\"colB\",...], data:[[...]], dtypes?:{colA:pandasDtype, ...}, formats?:{colA:numberFormat, ...}}`. Agents typically build it from a DataFrame via `{**json.loads(df.to_json(orient=\"split\")), \"dtypes\": df.dtypes.astype(str).to_dict()}`. Mutually exclusive with --values and --dataframe. Creates the workbook, then writes typed type-faithful data (dates land as real dates, numbers keep precision).",
|
||||
"input": [
|
||||
"file",
|
||||
"stdin"
|
||||
@@ -543,6 +543,13 @@
|
||||
"stdin"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "dataframe",
|
||||
"kind": "own",
|
||||
"type": "string",
|
||||
"required": "optional",
|
||||
"desc": "Single-sheet typed table from one Arrow IPC file (Feather v2 — what `pandas.DataFrame.to_feather()` writes), mutually exclusive with --values and --sheets. Pass `@<path>` for a file or `-` for binary stdin (same convention as other input flags). Arrow bytes are read raw — no TrimSpace / BOM strip — so the IPC magic survives intact (unlike text input flags). Column types come from the Arrow schema; per-column `number_format` may be set via Arrow field metadata. Creates the workbook and fills its default sheet (`Sheet1` — adopted in place, no empty Sheet1 left behind). For multi-sheet or non-default placement, use `--sheets` instead."
|
||||
},
|
||||
{
|
||||
"name": "dry-run",
|
||||
"kind": "system",
|
||||
@@ -1382,6 +1389,13 @@
|
||||
"required": "optional",
|
||||
"desc": "Treat the first row as data instead of a header (columns get positional names col1, col2, ...)"
|
||||
},
|
||||
{
|
||||
"name": "dataframe-out",
|
||||
"kind": "own",
|
||||
"type": "string",
|
||||
"required": "optional",
|
||||
"desc": "Write the typed table as one Arrow IPC file (Feather v2) instead of the default JSON. Pass `@<path>` for a file or `-` for binary stdout (same convention as other binary I/O flags). Mirror of the input-side `--dataframe` on `+table-put` / `+workbook-create` — pandas users round-trip via `df = pd.read_feather(\"x.arrow\")` or `pd.read_feather(io.BytesIO(stdout))`. Single-sheet only: requires `--sheet-id` or `--sheet-name`; whole-workbook reads keep the default JSON path. Column types come from the typed read-back (string/number/date/bool); per-column `number_format` is preserved as Arrow field metadata so the Arrow file can round-trip straight back through `+table-put --dataframe`."
|
||||
},
|
||||
{
|
||||
"name": "dry-run",
|
||||
"kind": "system",
|
||||
@@ -2064,13 +2078,20 @@
|
||||
"name": "sheets",
|
||||
"kind": "own",
|
||||
"type": "string",
|
||||
"required": "required",
|
||||
"desc": "Typed table payload (pandas-DataFrame-shaped) as JSON: a top-level `sheets` array, each item `{name, start_cell?, mode?, header?, allow_overwrite?, columns:[\"colA\",\"colB\",...], data:[[...]], dtypes?:{colA:pandasDtype, ...}, formats?:{colA:numberFormat, ...}}`. Agents typically build it with `{**json.loads(df.to_json(orient=\"split\")), \"dtypes\": df.dtypes.astype(str).to_dict()}`. `dtypes` values are pandas dtype strings (`int64`, `float64`, `Int64`, `bool`, `boolean`, `datetime64[ns]`, `object`, ...); the writer maps them to internal string/number/date/bool — omit `dtypes` and a column writes as text (good for raw CSV-shaped data). `formats[col]` is an Excel number_format string (e.g. `#,##0.00`, `0.0%`, `yyyy-mm`); when absent, date columns default to `yyyy-mm-dd` and string columns to text format (`@`).",
|
||||
"required": "xor",
|
||||
"desc": "Typed table payload (pandas-DataFrame-shaped) as JSON, XOR with `--dataframe`: a top-level `sheets` array, each item `{name, start_cell?, mode?, header?, allow_overwrite?, columns:[\"colA\",\"colB\",...], data:[[...]], dtypes?:{colA:pandasDtype, ...}, formats?:{colA:numberFormat, ...}}`. Agents typically build it with `{**json.loads(df.to_json(orient=\"split\")), \"dtypes\": df.dtypes.astype(str).to_dict()}`. `dtypes` values are pandas dtype strings (`int64`, `float64`, `Int64`, `bool`, `boolean`, `datetime64[ns]`, `object`, ...); the writer maps them to internal string/number/date/bool — omit `dtypes` and a column writes as text (good for raw CSV-shaped data). `formats[col]` is an Excel number_format string (e.g. `#,##0.00`, `0.0%`, `yyyy-mm`); when absent, date columns default to `yyyy-mm-dd` and string columns to text format (`@`).",
|
||||
"input": [
|
||||
"file",
|
||||
"stdin"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "dataframe",
|
||||
"kind": "own",
|
||||
"type": "string",
|
||||
"required": "xor",
|
||||
"desc": "Single-sheet typed table from one Arrow IPC file (a.k.a. Feather v2 — what `pandas.DataFrame.to_feather()` writes), XOR with `--sheets`. Pass `@<path>` for a file or `-` for binary stdin (same convention as other input flags). Arrow bytes are read raw — no TrimSpace / BOM strip — so the IPC magic survives intact (unlike text input flags). Column types come from the Arrow schema (int*/uint*/float* → number, date32/date64/timestamp → date, utf8/large_utf8 → string, bool → bool); per-column `number_format` may be set via Arrow field metadata (`pa.field(\"price\", pa.float64(), metadata={b\"number_format\": b\"$#,##0.00\"})`). Writes the sheet at default placement: name `Sheet1` (created when absent), overwrite from A1 with header. For a different sheet name, anchor, mode, or to write multiple sheets, use `--sheets` instead."
|
||||
},
|
||||
{
|
||||
"name": "dry-run",
|
||||
"kind": "system",
|
||||
|
||||
@@ -923,6 +923,7 @@ var flagDefs = map[string]commandDef{
|
||||
{Name: "sheet-name", Kind: "own", Type: "string", Required: "optional", Desc: "Read only this sheet (by name); omit to read all sheets"},
|
||||
{Name: "range", Kind: "own", Type: "string", Required: "optional", Desc: "A1 range to read; omit to read each sheet current region"},
|
||||
{Name: "no-header", Kind: "own", Type: "bool", Required: "optional", Desc: "Treat the first row as data instead of a header (columns get positional names col1, col2, ...)"},
|
||||
{Name: "dataframe-out", Kind: "own", Type: "string", Required: "optional", Desc: "Write the typed table as one Arrow IPC file (Feather v2) instead of the default JSON. Pass `@<path>` for a file or `-` for binary stdout (same convention as other binary I/O flags). Mirror of the input-side `--dataframe` on `+table-put` / `+workbook-create` — pandas users round-trip via `df = pd.read_feather(\"x.arrow\")` or `pd.read_feather(io.BytesIO(stdout))`. Single-sheet only: requires `--sheet-id` or `--sheet-name`; whole-workbook reads keep the default JSON path. Column types come from the typed read-back (string/number/date/bool); per-column `number_format` is preserved as Arrow field metadata so the Arrow file can round-trip straight back through `+table-put --dataframe`."},
|
||||
{Name: "dry-run", Kind: "system", Type: "bool", Required: "optional"},
|
||||
},
|
||||
},
|
||||
@@ -931,7 +932,8 @@ var flagDefs = map[string]commandDef{
|
||||
Flags: []flagDef{
|
||||
{Name: "url", Kind: "public", Type: "string", Required: "xor", Desc: "Spreadsheet URL to write into (XOR with `--spreadsheet-token`)"},
|
||||
{Name: "spreadsheet-token", Kind: "public", Type: "string", Required: "xor", Desc: "Spreadsheet token to write into (XOR with `--url`)"},
|
||||
{Name: "sheets", Kind: "own", Type: "string", Required: "required", Desc: "Typed table payload (pandas-DataFrame-shaped) as JSON: a top-level `sheets` array, each item `{name, start_cell?, mode?, header?, allow_overwrite?, columns:[\"colA\",\"colB\",...], data:[[...]], dtypes?:{colA:pandasDtype, ...}, formats?:{colA:numberFormat, ...}}`. Agents typically build it with `{**json.loads(df.to_json(orient=\"split\")), \"dtypes\": df.dtypes.astype(str).to_dict()}`. `dtypes` values are pandas dtype strings (`int64`, `float64`, `Int64`, `bool`, `boolean`, `datetime64[ns]`, `object`, ...); the writer maps them to internal string/number/date/bool — omit `dtypes` and a column writes as text (good for raw CSV-shaped data). `formats[col]` is an Excel number_format string (e.g. `#,##0.00`, `0.0%`, `yyyy-mm`); when absent, date columns default to `yyyy-mm-dd` and string columns to text format (`@`).", Input: []string{"file", "stdin"}},
|
||||
{Name: "sheets", Kind: "own", Type: "string", Required: "xor", Desc: "Typed table payload (pandas-DataFrame-shaped) as JSON, XOR with `--dataframe`: a top-level `sheets` array, each item `{name, start_cell?, mode?, header?, allow_overwrite?, columns:[\"colA\",\"colB\",...], data:[[...]], dtypes?:{colA:pandasDtype, ...}, formats?:{colA:numberFormat, ...}}`. Agents typically build it with `{**json.loads(df.to_json(orient=\"split\")), \"dtypes\": df.dtypes.astype(str).to_dict()}`. `dtypes` values are pandas dtype strings (`int64`, `float64`, `Int64`, `bool`, `boolean`, `datetime64[ns]`, `object`, ...); the writer maps them to internal string/number/date/bool — omit `dtypes` and a column writes as text (good for raw CSV-shaped data). `formats[col]` is an Excel number_format string (e.g. `#,##0.00`, `0.0%`, `yyyy-mm`); when absent, date columns default to `yyyy-mm-dd` and string columns to text format (`@`).", Input: []string{"file", "stdin"}},
|
||||
{Name: "dataframe", Kind: "own", Type: "string", Required: "xor", Desc: "Single-sheet typed table from one Arrow IPC file (a.k.a. Feather v2 — what `pandas.DataFrame.to_feather()` writes), XOR with `--sheets`. Pass `@<path>` for a file or `-` for binary stdin (same convention as other input flags). Arrow bytes are read raw — no TrimSpace / BOM strip — so the IPC magic survives intact (unlike text input flags). Column types come from the Arrow schema (int*/uint*/float* → number, date32/date64/timestamp → date, utf8/large_utf8 → string, bool → bool); per-column `number_format` may be set via Arrow field metadata (`pa.field(\"price\", pa.float64(), metadata={b\"number_format\": b\"$#,##0.00\"})`). Writes the sheet at default placement: name `Sheet1` (created when absent), overwrite from A1 with header. For a different sheet name, anchor, mode, or to write multiple sheets, use `--sheets` instead."},
|
||||
{Name: "dry-run", Kind: "system", Type: "bool", Required: "optional"},
|
||||
},
|
||||
},
|
||||
@@ -941,8 +943,9 @@ var flagDefs = map[string]commandDef{
|
||||
{Name: "title", Kind: "own", Type: "string", Required: "required", Desc: "Spreadsheet title"},
|
||||
{Name: "folder-token", Kind: "own", Type: "string", Required: "optional", Desc: "Target folder token; placed at the drive root when omitted"},
|
||||
{Name: "values", Kind: "own", Type: "string", Required: "optional", Desc: "Untyped initial data as one 2D JSON array (`[[\"alice\",95]]`); values are written as-is with their type auto-detected, through the same batched set_cell_range path as --sheets — pair with --styles for number formats, colors, merges, and row/col sizes", Input: []string{"file", "stdin"}},
|
||||
{Name: "sheets", Kind: "own", Type: "string", Required: "optional", Desc: "Typed table payload as JSON (same shape as `+table-put`): a top-level `sheets` array, each item `{name, start_cell?, mode?, header?, allow_overwrite?, columns:[\"colA\",\"colB\",...], data:[[...]], dtypes?:{colA:pandasDtype, ...}, formats?:{colA:numberFormat, ...}}`. Agents typically build it from a DataFrame via `{**json.loads(df.to_json(orient=\"split\")), \"dtypes\": df.dtypes.astype(str).to_dict()}`. Mutually exclusive with --values. Creates the workbook, then writes typed type-faithful data (dates land as real dates, numbers keep precision).", Input: []string{"file", "stdin"}},
|
||||
{Name: "sheets", Kind: "own", Type: "string", Required: "optional", Desc: "Typed table payload as JSON (same shape as `+table-put`): a top-level `sheets` array, each item `{name, start_cell?, mode?, header?, allow_overwrite?, columns:[\"colA\",\"colB\",...], data:[[...]], dtypes?:{colA:pandasDtype, ...}, formats?:{colA:numberFormat, ...}}`. Agents typically build it from a DataFrame via `{**json.loads(df.to_json(orient=\"split\")), \"dtypes\": df.dtypes.astype(str).to_dict()}`. Mutually exclusive with --values and --dataframe. Creates the workbook, then writes typed type-faithful data (dates land as real dates, numbers keep precision).", Input: []string{"file", "stdin"}},
|
||||
{Name: "styles", Kind: "own", Type: "string", Required: "optional", Desc: "Initial visual operations as JSON: top-level `{styles:[...]}`. Each item corresponds to one target sheet and must include `name`, plus at least one of `cell_styles` / `row_sizes` / `col_sizes` / `cell_merges`. `cell_styles` entries use +cells-set-style fields with a cell range; row/col sizes use dimension ranges plus type/size; merges use cell ranges plus optional merge_type. With --sheets, styles array length/order/name must match --sheets.sheets. With --values, pass exactly one styles item for the initial sheet (its name is ignored).", Input: []string{"file", "stdin"}},
|
||||
{Name: "dataframe", Kind: "own", Type: "string", Required: "optional", Desc: "Single-sheet typed table from one Arrow IPC file (Feather v2 — what `pandas.DataFrame.to_feather()` writes), mutually exclusive with --values and --sheets. Pass `@<path>` for a file or `-` for binary stdin (same convention as other input flags). Arrow bytes are read raw — no TrimSpace / BOM strip — so the IPC magic survives intact (unlike text input flags). Column types come from the Arrow schema; per-column `number_format` may be set via Arrow field metadata. Creates the workbook and fills its default sheet (`Sheet1` — adopted in place, no empty Sheet1 left behind). For multi-sheet or non-default placement, use `--sheets` instead."},
|
||||
{Name: "dry-run", Kind: "system", Type: "bool", Required: "optional"},
|
||||
},
|
||||
},
|
||||
|
||||
553
shortcuts/sheets/lark_sheet_dataframe.go
Normal file
553
shortcuts/sheets/lark_sheet_dataframe.go
Normal file
@@ -0,0 +1,553 @@
|
||||
// Copyright (c) 2026 Lark Technologies Pte. Ltd.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package sheets
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/apache/arrow/go/v17/arrow"
|
||||
"github.com/apache/arrow/go/v17/arrow/array"
|
||||
"github.com/apache/arrow/go/v17/arrow/ipc"
|
||||
"github.com/apache/arrow/go/v17/arrow/memory"
|
||||
|
||||
"github.com/larksuite/cli/extension/fileio"
|
||||
"github.com/larksuite/cli/internal/cmdutil"
|
||||
"github.com/larksuite/cli/shortcuts/common"
|
||||
)
|
||||
|
||||
// ─── --dataframe (Arrow IPC / Feather v2 binary input) ────────────────
|
||||
//
|
||||
// --dataframe is the binary-typed twin of --sheets. The wire payload is one
|
||||
// Arrow IPC file (a.k.a. Feather v2 — what `pandas.DataFrame.to_feather()`
|
||||
// writes), single schema, optionally multi-batch. Type / format are read off
|
||||
// the Arrow schema (no separate dtypes/formats maps), and per-column number
|
||||
// format can be set via the field's `number_format` metadata key:
|
||||
//
|
||||
// pa.field("price", pa.float64(), metadata={b"number_format": b"$#,##0.00"})
|
||||
//
|
||||
// One DataFrame writes into one sub-sheet at fixed defaults: name `Sheet1`
|
||||
// (adopted in place by +workbook-create; created when absent by +table-put),
|
||||
// overwrite from A1 with header on, allow_overwrite=true. The shortcut
|
||||
// surface is deliberately the one flag — anything that needs a different
|
||||
// sheet name / anchor / mode / multi-sheet falls back to --sheets, whose
|
||||
// JSON payload already carries every knob.
|
||||
//
|
||||
// Binary IO note: --dataframe bypasses the text-oriented Input resolver
|
||||
// (`runtime.Str("dataframe")` carries a *path*, not file contents). Reading
|
||||
// the Arrow bytes through that resolver would TrimSpace the trailing IPC
|
||||
// magic / corrupt non-UTF8 bytes. Path → FileIO.Open → io.ReadAll keeps the
|
||||
// stream byte-exact. "-" reads from stdin directly.
|
||||
|
||||
// dataframeDefaultSheetName is the sub-sheet name --dataframe writes into.
|
||||
// Matches valuesSheetName so +workbook-create adopts the brand-new
|
||||
// workbook's default sheet in place (no stray empty Sheet1 left behind);
|
||||
// +table-put creates Sheet1 if it doesn't already exist.
|
||||
const dataframeDefaultSheetName = valuesSheetName // alias of valuesSheetName, so the two names always hold the same value
|
||||
|
||||
// parseDataframePayload reads the --dataframe path (Arrow IPC file) and
|
||||
// composes a single-sheet tablePayload at the fixed default placement.
|
||||
// Network-free: safe from Validate and DryRun. The resulting tableSheetSpec
|
||||
// rides the same buildSheetMatrix / buildTypedCell path as a --sheets entry,
|
||||
// so downstream is unaware of where the rows came from.
|
||||
func parseDataframePayload(rctx *common.RuntimeContext) (*tablePayload, error) {
|
||||
raw := strings.TrimSpace(rctx.Str("dataframe"))
|
||||
if raw == "" {
|
||||
return nil, common.FlagErrorf("--dataframe is required")
|
||||
}
|
||||
data, err := readDataframeBytes(rctx, raw)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
spec, err := decodeArrowToSheet(data, dataframeDefaultSheetName)
|
||||
if err != nil {
|
||||
return nil, common.FlagErrorf("--dataframe: %v", err)
|
||||
}
|
||||
payload := &tablePayload{Sheets: []tableSheetSpec{spec}}
|
||||
if err := payload.validate(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return payload, nil
|
||||
}
|
||||
|
||||
// dataframeStdinCache holds the bytes read from stdin on the first call so a
|
||||
// later call (Validate → Execute / DryRun) gets the same bytes instead of an
|
||||
// empty stream — stdin is single-shot, but parseDataframePayload runs
|
||||
// multiple times per command invocation. Process-wide is fine: lark-cli is
|
||||
// one-shot (one command per process). Tests reset by setting it back to nil.
|
||||
var dataframeStdinCache []byte // nil means stdin has not been read yet; readDataframeBytes stores the bytes after the first successful, non-empty read
|
||||
|
||||
// readDataframeBytes resolves --dataframe to raw binary. A literal `@` prefix
|
||||
// is tolerated for symmetry with --sheets (`@/tmp/x.arrow` and `/tmp/x.arrow`
|
||||
// both work). `-` reads stdin verbatim — cached on first call so Validate /
|
||||
// Execute / DryRun all see the same bytes. Bytes are returned untouched: no
|
||||
// TrimSpace, no BOM strip — both would corrupt an Arrow IPC stream.
|
||||
func readDataframeBytes(rctx *common.RuntimeContext, raw string) ([]byte, error) {
|
||||
if raw == "-" {
|
||||
if dataframeStdinCache != nil {
|
||||
return dataframeStdinCache, nil
|
||||
}
|
||||
io := rctx.IO()
|
||||
if io == nil || io.In == nil {
|
||||
return nil, common.FlagErrorf("--dataframe: stdin is not available")
|
||||
}
|
||||
data, err := readAllBytes(io.In)
|
||||
if err != nil {
|
||||
return nil, common.FlagErrorf("--dataframe: read stdin: %v", err)
|
||||
}
|
||||
if len(data) == 0 {
|
||||
return nil, common.FlagErrorf("--dataframe: stdin is empty")
|
||||
}
|
||||
dataframeStdinCache = data
|
||||
return data, nil
|
||||
}
|
||||
path := strings.TrimPrefix(raw, "@")
|
||||
data, err := cmdutil.ReadInputFile(rctx.FileIO(), path)
|
||||
if err != nil {
|
||||
return nil, common.FlagErrorf("--dataframe: %v", err)
|
||||
}
|
||||
if len(data) == 0 {
|
||||
return nil, common.FlagErrorf("--dataframe: file %q is empty", path)
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// readAllBytes drains r and returns everything it produced. It forwards
// straight to io.ReadAll and adds nothing of its own; it exists as a named
// seam for callers in this file.
func readAllBytes(r io.Reader) ([]byte, error) {
	content, err := io.ReadAll(r)
	return content, err
}
|
||||
|
||||
// decodeArrowToSheet reads `data` as an Arrow IPC file (single schema,
|
||||
// possibly multi-batch) and produces a tableSheetSpec with name + columns +
|
||||
// rows filled in. Sheet placement (start_cell / mode / header / overwrite) is
|
||||
// not touched here — parseDataframePayload layers those on from CLI flags.
|
||||
func decodeArrowToSheet(data []byte, sheetName string) (tableSheetSpec, error) {
|
||||
reader, err := ipc.NewFileReader(bytes.NewReader(data))
|
||||
if err != nil {
|
||||
return tableSheetSpec{}, fmt.Errorf("invalid Arrow IPC file (expected pandas df.to_feather output): %v", err)
|
||||
}
|
||||
defer reader.Close()
|
||||
|
||||
schema := reader.Schema()
|
||||
if schema == nil || schema.NumFields() == 0 {
|
||||
return tableSheetSpec{}, fmt.Errorf("Arrow schema has no fields")
|
||||
}
|
||||
|
||||
ncols := schema.NumFields()
|
||||
cols := make([]tableColumnSpec, ncols)
|
||||
seen := make(map[string]bool, ncols)
|
||||
for i := 0; i < ncols; i++ {
|
||||
f := schema.Field(i)
|
||||
name := f.Name
|
||||
if strings.TrimSpace(name) == "" {
|
||||
return tableSheetSpec{}, fmt.Errorf("column %d has empty name", i)
|
||||
}
|
||||
if seen[name] {
|
||||
return tableSheetSpec{}, fmt.Errorf("duplicate column name %q", name)
|
||||
}
|
||||
seen[name] = true
|
||||
typ, format, err := arrowFieldToTypeFormat(f)
|
||||
if err != nil {
|
||||
return tableSheetSpec{}, fmt.Errorf("column %q: %v", name, err)
|
||||
}
|
||||
cols[i] = tableColumnSpec{Name: name, Type: typ, Format: format}
|
||||
}
|
||||
|
||||
var rows [][]interface{}
|
||||
for b := 0; b < reader.NumRecords(); b++ {
|
||||
rec, err := reader.RecordAt(b)
|
||||
if err != nil {
|
||||
return tableSheetSpec{}, fmt.Errorf("read record batch %d: %v", b, err)
|
||||
}
|
||||
batchRows, err := arrowRecordToRows(rec, cols)
|
||||
rec.Release()
|
||||
if err != nil {
|
||||
return tableSheetSpec{}, err
|
||||
}
|
||||
rows = append(rows, batchRows...)
|
||||
}
|
||||
|
||||
return tableSheetSpec{Name: sheetName, Columns: cols, Rows: rows}, nil
|
||||
}
|
||||
|
||||
// arrowFieldToTypeFormat maps an Arrow field to the internal (type, format)
|
||||
// pair. The field's `number_format` metadata key — when present — sets the
|
||||
// Excel number_format string verbatim; otherwise sensible defaults are
|
||||
// applied per type (`@` text for strings, `yyyy-mm-dd` for dates).
|
||||
func arrowFieldToTypeFormat(f arrow.Field) (typ, format string, err error) {
|
||||
if v, ok := f.Metadata.GetValue("number_format"); ok {
|
||||
format = strings.TrimSpace(v)
|
||||
}
|
||||
switch f.Type.(type) {
|
||||
case *arrow.StringType, *arrow.LargeStringType:
|
||||
if format == "" {
|
||||
format = "@"
|
||||
}
|
||||
return "string", format, nil
|
||||
case *arrow.BooleanType:
|
||||
return "bool", format, nil
|
||||
case *arrow.Date32Type, *arrow.Date64Type, *arrow.TimestampType:
|
||||
if format == "" {
|
||||
format = "yyyy-mm-dd"
|
||||
}
|
||||
return "date", format, nil
|
||||
}
|
||||
if isArrowNumericType(f.Type) {
|
||||
return "number", format, nil
|
||||
}
|
||||
return "", "", fmt.Errorf("unsupported Arrow type %s (want string/number/date/bool)", f.Type.Name())
|
||||
}
|
||||
|
||||
func isArrowNumericType(t arrow.DataType) bool {
|
||||
switch t.ID() {
|
||||
case arrow.INT8, arrow.INT16, arrow.INT32, arrow.INT64,
|
||||
arrow.UINT8, arrow.UINT16, arrow.UINT32, arrow.UINT64,
|
||||
arrow.FLOAT16, arrow.FLOAT32, arrow.FLOAT64:
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// arrowRecordToRows transposes one column-batch into row-major
|
||||
// [][]interface{} matched to `cols`. Cells are stamped with the same value
|
||||
// shapes buildTypedCell expects from the JSON path: nil for nulls,
|
||||
// json.Number for numerics (precision-preserving), `yyyy-mm-dd` strings for
|
||||
// dates/timestamps, bool for booleans, string for strings.
|
||||
func arrowRecordToRows(rec arrow.Record, cols []tableColumnSpec) ([][]interface{}, error) {
|
||||
if int(rec.NumCols()) != len(cols) {
|
||||
return nil, fmt.Errorf("record has %d cols, schema declared %d", rec.NumCols(), len(cols))
|
||||
}
|
||||
nrows := int(rec.NumRows())
|
||||
rows := make([][]interface{}, nrows)
|
||||
for r := range rows {
|
||||
rows[r] = make([]interface{}, len(cols))
|
||||
}
|
||||
for c := range cols {
|
||||
arr := rec.Column(c)
|
||||
for r := 0; r < nrows; r++ {
|
||||
if arr.IsNull(r) {
|
||||
continue
|
||||
}
|
||||
v, err := arrowCellValue(arr, r)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("row %d column %q: %v", r, cols[c].Name, err)
|
||||
}
|
||||
rows[r][c] = v
|
||||
}
|
||||
}
|
||||
return rows, nil
|
||||
}
|
||||
|
||||
func arrowCellValue(arr arrow.Array, i int) (interface{}, error) {
|
||||
switch a := arr.(type) {
|
||||
case *array.String:
|
||||
return a.Value(i), nil
|
||||
case *array.LargeString:
|
||||
return a.Value(i), nil
|
||||
case *array.Boolean:
|
||||
return a.Value(i), nil
|
||||
case *array.Int8:
|
||||
return json.Number(strconv.FormatInt(int64(a.Value(i)), 10)), nil
|
||||
case *array.Int16:
|
||||
return json.Number(strconv.FormatInt(int64(a.Value(i)), 10)), nil
|
||||
case *array.Int32:
|
||||
return json.Number(strconv.FormatInt(int64(a.Value(i)), 10)), nil
|
||||
case *array.Int64:
|
||||
return json.Number(strconv.FormatInt(a.Value(i), 10)), nil
|
||||
case *array.Uint8:
|
||||
return json.Number(strconv.FormatUint(uint64(a.Value(i)), 10)), nil
|
||||
case *array.Uint16:
|
||||
return json.Number(strconv.FormatUint(uint64(a.Value(i)), 10)), nil
|
||||
case *array.Uint32:
|
||||
return json.Number(strconv.FormatUint(uint64(a.Value(i)), 10)), nil
|
||||
case *array.Uint64:
|
||||
return json.Number(strconv.FormatUint(a.Value(i), 10)), nil
|
||||
case *array.Float16:
|
||||
return json.Number(strconv.FormatFloat(float64(a.Value(i).Float32()), 'f', -1, 32)), nil
|
||||
case *array.Float32:
|
||||
return json.Number(strconv.FormatFloat(float64(a.Value(i)), 'f', -1, 32)), nil
|
||||
case *array.Float64:
|
||||
return json.Number(strconv.FormatFloat(a.Value(i), 'f', -1, 64)), nil
|
||||
case *array.Date32:
|
||||
// Date32: days since 1970-01-01 (epoch). Multiply to seconds, format
|
||||
// in UTC so timezone offset can't flip the calendar date.
|
||||
t := time.Unix(int64(a.Value(i))*86400, 0).UTC()
|
||||
return t.Format("2006-01-02"), nil
|
||||
case *array.Date64:
|
||||
t := time.UnixMilli(int64(a.Value(i))).UTC()
|
||||
return t.Format("2006-01-02"), nil
|
||||
case *array.Timestamp:
|
||||
ts := int64(a.Value(i))
|
||||
unit := a.DataType().(*arrow.TimestampType).Unit
|
||||
var t time.Time
|
||||
switch unit {
|
||||
case arrow.Second:
|
||||
t = time.Unix(ts, 0).UTC()
|
||||
case arrow.Millisecond:
|
||||
t = time.UnixMilli(ts).UTC()
|
||||
case arrow.Microsecond:
|
||||
t = time.UnixMicro(ts).UTC()
|
||||
case arrow.Nanosecond:
|
||||
t = time.Unix(0, ts).UTC()
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported timestamp unit %v", unit)
|
||||
}
|
||||
return t.Format("2006-01-02"), nil
|
||||
}
|
||||
return nil, fmt.Errorf("unsupported Arrow array %T", arr)
|
||||
}
|
||||
|
||||
// ─── --dataframe-out (Arrow IPC binary output, mirror of --dataframe) ──
|
||||
//
|
||||
// +table-get's binary read-back: encode one sheet's typed read-back as an
|
||||
// Arrow IPC file (Feather v2), so pandas can `pd.read_feather(path)` /
|
||||
// `pd.read_feather(BytesIO(stdout))` symmetrically with the put side.
|
||||
// Single-sheet only — Arrow IPC carries one schema per file. The JSON path
|
||||
// is unchanged; --dataframe-out swaps the encoder for callers that already
|
||||
// have pandas / pyarrow in their pipeline.
|
||||
|
||||
// encodeSheetMapToArrowIPC turns one readSheetAsSpec output into an Arrow IPC
|
||||
// file blob. Internal column types are recovered from `dtypes` (the wire
|
||||
// proxy for the typed protocol), and per-column `number_format` rides through
|
||||
// as Arrow field metadata so the file feeds straight back into
|
||||
// `+table-put --dataframe`.
|
||||
func encodeSheetMapToArrowIPC(sheet map[string]interface{}) ([]byte, error) {
|
||||
columns, _ := sheet["columns"].([]interface{})
|
||||
if len(columns) == 0 {
|
||||
return nil, fmt.Errorf("sheet has no columns")
|
||||
}
|
||||
dtypes, _ := sheet["dtypes"].(map[string]interface{})
|
||||
formats, _ := sheet["formats"].(map[string]interface{})
|
||||
// `data` arrives as either []interface{} (when the sheet came through a
|
||||
// JSON round-trip / unit-test fixture) or [][]interface{} (the shape
|
||||
// readSheetAsSpec directly emits in production). Accept both — anything
|
||||
// else falls through to a zero-row table.
|
||||
var rawData [][]interface{}
|
||||
switch d := sheet["data"].(type) {
|
||||
case [][]interface{}:
|
||||
rawData = d
|
||||
case []interface{}:
|
||||
rawData = make([][]interface{}, len(d))
|
||||
for i, r := range d {
|
||||
rawData[i], _ = r.([]interface{})
|
||||
}
|
||||
}
|
||||
|
||||
ncols := len(columns)
|
||||
colNames := make([]string, ncols)
|
||||
colTypes := make([]string, ncols)
|
||||
fields := make([]arrow.Field, ncols)
|
||||
for i, c := range columns {
|
||||
name, _ := c.(string)
|
||||
if name == "" {
|
||||
return nil, fmt.Errorf("column %d has empty name", i)
|
||||
}
|
||||
colNames[i] = name
|
||||
dt, _ := dtypes[name].(string)
|
||||
colTypes[i] = dtypeToInternalType(dt)
|
||||
var meta arrow.Metadata
|
||||
if formats != nil {
|
||||
if nf, ok := formats[name].(string); ok && strings.TrimSpace(nf) != "" {
|
||||
meta = arrow.NewMetadata([]string{"number_format"}, []string{nf})
|
||||
}
|
||||
}
|
||||
fields[i] = arrow.Field{
|
||||
Name: name,
|
||||
Type: internalTypeToArrowType(colTypes[i]),
|
||||
Nullable: true,
|
||||
Metadata: meta,
|
||||
}
|
||||
}
|
||||
schema := arrow.NewSchema(fields, nil)
|
||||
|
||||
mem := memory.NewGoAllocator()
|
||||
rb := array.NewRecordBuilder(mem, schema)
|
||||
defer rb.Release()
|
||||
for r, row := range rawData {
|
||||
if len(row) != ncols {
|
||||
return nil, fmt.Errorf("row %d has %d cells, want %d", r, len(row), ncols)
|
||||
}
|
||||
for c := 0; c < ncols; c++ {
|
||||
if err := appendArrowCell(rb.Field(c), colTypes[c], row[c]); err != nil {
|
||||
return nil, fmt.Errorf("row %d column %q: %v", r, colNames[c], err)
|
||||
}
|
||||
}
|
||||
}
|
||||
rec := rb.NewRecord()
|
||||
defer rec.Release()
|
||||
|
||||
var buf bytesWriterSeeker
|
||||
w, err := ipc.NewFileWriter(&buf, ipc.WithSchema(schema), ipc.WithAllocator(mem))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("ipc.NewFileWriter: %v", err)
|
||||
}
|
||||
if err := w.Write(rec); err != nil {
|
||||
return nil, fmt.Errorf("write record: %v", err)
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
return nil, fmt.Errorf("close writer: %v", err)
|
||||
}
|
||||
return buf.buf, nil
|
||||
}
|
||||
|
||||
// dtypeToInternalType maps a wire-level dtype label to the internal column
// type ("number", "bool", "date" or "string"). It is intended as the inverse
// of typeToDtype (defined outside this chunk).
//
// The label is trimmed and lower-cased once, and that normalised form is used
// for every comparison, so padded or mixed-case labels such as " Int64 " or
// " DateTime64[ns] " are classified the same way as their canonical spelling.
// Previously only the exact-match cases were trimmed, so a padded datetime
// label fell through to "string".
//
// Any label that is not a known numeric/bool name and does not start with
// "datetime" (including "object" and the empty string) falls back to
// "string".
func dtypeToInternalType(dtype string) string {
	label := strings.ToLower(strings.TrimSpace(dtype))
	switch label {
	case "float64", "float32", "int64", "int32", "int16", "int8",
		"uint64", "uint32", "uint16", "uint8":
		return "number"
	case "bool", "boolean":
		return "bool"
	}
	// datetime labels carry a unit / timezone suffix, so match by prefix.
	if strings.HasPrefix(label, "datetime") {
		return "date"
	}
	return "string"
}
|
||||
|
||||
// internalTypeToArrowType is the put-side dtypeToTypeFormat dual: maps the
|
||||
// internal column type to the Arrow data type the encoder builds a column
|
||||
// with. Numbers go to float64 because +table-get can't tell int from float
|
||||
// from a number_format alone — float64 covers both losslessly for the cell
|
||||
// ranges Lark Sheets accepts.
|
||||
func internalTypeToArrowType(typ string) arrow.DataType {
|
||||
switch typ {
|
||||
case "number":
|
||||
return arrow.PrimitiveTypes.Float64
|
||||
case "date":
|
||||
return arrow.FixedWidthTypes.Date32
|
||||
case "bool":
|
||||
return arrow.FixedWidthTypes.Boolean
|
||||
}
|
||||
return arrow.BinaryTypes.String
|
||||
}
|
||||
|
||||
// appendArrowCell stamps one cell into its column builder. Cell shape matches
|
||||
// what cellToTyped emits on the JSON path: json.Number for numbers, ISO
|
||||
// `yyyy-mm-dd` string for dates, plain string for strings, bool for bools,
|
||||
// nil for empty. Anything off-shape errors so the caller doesn't silently
|
||||
// emit nulls for malformed data.
|
||||
func appendArrowCell(b array.Builder, typ string, v interface{}) error {
|
||||
if v == nil {
|
||||
b.AppendNull()
|
||||
return nil
|
||||
}
|
||||
switch typ {
|
||||
case "string":
|
||||
s, ok := v.(string)
|
||||
if !ok {
|
||||
return fmt.Errorf("string expects string value, got %T", v)
|
||||
}
|
||||
b.(*array.StringBuilder).Append(s)
|
||||
case "number":
|
||||
f, err := arrowNumber(v)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
b.(*array.Float64Builder).Append(f)
|
||||
case "date":
|
||||
s, ok := v.(string)
|
||||
if !ok {
|
||||
return fmt.Errorf("date expects ISO yyyy-mm-dd string, got %T", v)
|
||||
}
|
||||
t, err := time.Parse("2006-01-02", strings.TrimSpace(s))
|
||||
if err != nil {
|
||||
return fmt.Errorf("date parse %q: %v", s, err)
|
||||
}
|
||||
b.(*array.Date32Builder).Append(arrow.Date32FromTime(t))
|
||||
case "bool":
|
||||
bb, ok := v.(bool)
|
||||
if !ok {
|
||||
return fmt.Errorf("bool expects bool, got %T", v)
|
||||
}
|
||||
b.(*array.BooleanBuilder).Append(bb)
|
||||
default:
|
||||
return fmt.Errorf("unsupported internal type %q", typ)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// arrowNumber converts a number cell to float64 for the Arrow float builder.
//
// Two shapes are accepted: json.Number (parsed with its Float64 method) and
// a plain float64 (returned as is). Any other dynamic type, including nil,
// is an error; so is a json.Number whose text does not parse.
func arrowNumber(v interface{}) (float64, error) {
	if plain, isFloat := v.(float64); isFloat {
		return plain, nil
	}

	num, isNumber := v.(json.Number)
	if !isNumber {
		return 0, fmt.Errorf("number expects numeric value, got %T", v)
	}

	parsed, err := num.Float64()
	if err != nil {
		return 0, fmt.Errorf("number parse %q: %v", num.String(), err)
	}
	return parsed, nil
}
|
||||
|
||||
// bytesWriterSeeker is a minimal in-memory io.WriteSeeker. It is handed to
// ipc.NewFileWriter, which requires a seekable sink; buffering in memory
// avoids a temp file and works even when the final destination (stdout) is
// not seekable.
//
// buf holds everything written so far; pos is the offset at which the next
// Write lands. The zero value is an empty, ready-to-use buffer. It is not
// safe for concurrent use.
type bytesWriterSeeker struct {
	buf []byte
	pos int64
}

// Write copies p into the buffer at the current position, overwriting any
// existing bytes and growing the buffer as needed. If the position is past
// the current end (after a Seek beyond it), the gap is zero-filled. It always
// consumes all of p and never returns an error.
func (w *bytesWriterSeeker) Write(p []byte) (int, error) {
	end := w.pos + int64(len(p))
	if grow := end - int64(len(w.buf)); grow > 0 {
		w.buf = append(w.buf, make([]byte, grow)...)
	}
	n := copy(w.buf[w.pos:], p)
	w.pos = end
	return n, nil
}

// Seek moves the write position following io.Seeker semantics and returns the
// new absolute offset. Seeking past the end is allowed. An unknown whence or
// a resulting position before the start of the buffer is an error and leaves
// the position unchanged — without that check a negative position would make
// the next Write panic when slicing the buffer.
func (w *bytesWriterSeeker) Seek(offset int64, whence int) (int64, error) {
	var base int64
	switch whence {
	case io.SeekStart:
		base = 0
	case io.SeekCurrent:
		base = w.pos
	case io.SeekEnd:
		base = int64(len(w.buf))
	default:
		return 0, fmt.Errorf("unknown whence %d", whence)
	}
	next := base + offset
	if next < 0 {
		return 0, fmt.Errorf("seek to negative position %d", next)
	}
	w.pos = next
	return next, nil
}
|
||||
|
||||
// writeDataframeOut dispatches the encoded Arrow bytes to wherever --dataframe-out
|
||||
// points: `-` → process stdout, `@<path>` or plain path → local file. Symmetric
|
||||
// with readDataframeBytes on the input side: same `@` tolerance, same TrimPrefix
|
||||
// semantics, and an absolute path will still get rejected by FileIO's SafePath.
|
||||
func writeDataframeOut(rctx *common.RuntimeContext, raw string, data []byte) error {
|
||||
if raw == "-" {
|
||||
out := rctx.IO()
|
||||
if out == nil || out.Out == nil {
|
||||
return common.FlagErrorf("--dataframe-out: stdout is not available")
|
||||
}
|
||||
if _, err := out.Out.Write(data); err != nil {
|
||||
return fmt.Errorf("--dataframe-out: write stdout: %v", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
path := strings.TrimPrefix(raw, "@")
|
||||
fio := rctx.FileIO()
|
||||
if fio == nil {
|
||||
return common.FlagErrorf("--dataframe-out: file output is not available in this context")
|
||||
}
|
||||
// FileIO.Save validates the path via SafeOutputPath (the same sandbox
|
||||
// readDataframeBytes hits on the input side) and writes atomically, so we
|
||||
// don't need an extra ValidatePath call here.
|
||||
if _, err := fio.Save(path, fileio.SaveOptions{ContentLength: int64(len(data))}, bytes.NewReader(data)); err != nil {
|
||||
return fmt.Errorf("--dataframe-out: write %q: %v", path, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
378
shortcuts/sheets/lark_sheet_dataframe_test.go
Normal file
378
shortcuts/sheets/lark_sheet_dataframe_test.go
Normal file
@@ -0,0 +1,378 @@
|
||||
// Copyright (c) 2026 Lark Technologies Pte. Ltd.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package sheets
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/apache/arrow/go/v17/arrow"
|
||||
"github.com/apache/arrow/go/v17/arrow/array"
|
||||
"github.com/apache/arrow/go/v17/arrow/ipc"
|
||||
"github.com/apache/arrow/go/v17/arrow/memory"
|
||||
)
|
||||
|
||||
// buildArrowIPC writes one record into a Feather v2 (Arrow IPC file) blob.
|
||||
// Used by the round-trip tests below to stand in for what
|
||||
// `pandas.DataFrame.to_feather(path)` would produce; saves the tests from
|
||||
// depending on a pandas-shaped fixture file.
|
||||
//
|
||||
// ipc.NewFileWriter wants an io.WriteSeeker (it back-patches a footer
|
||||
// offset), so we write to a temp file and read the bytes back — simpler than
|
||||
// re-implementing a seekable in-memory buffer.
|
||||
func buildArrowIPC(t *testing.T, schema *arrow.Schema, build func(b *array.RecordBuilder)) []byte {
|
||||
t.Helper()
|
||||
mem := memory.NewGoAllocator()
|
||||
rb := array.NewRecordBuilder(mem, schema)
|
||||
defer rb.Release()
|
||||
build(rb)
|
||||
rec := rb.NewRecord()
|
||||
defer rec.Release()
|
||||
|
||||
path := filepath.Join(t.TempDir(), "df.arrow")
|
||||
f, err := os.Create(path)
|
||||
if err != nil {
|
||||
t.Fatalf("create temp arrow file: %v", err)
|
||||
}
|
||||
w, err := ipc.NewFileWriter(f, ipc.WithSchema(schema), ipc.WithAllocator(mem))
|
||||
if err != nil {
|
||||
f.Close()
|
||||
t.Fatalf("ipc.NewFileWriter: %v", err)
|
||||
}
|
||||
if err := w.Write(rec); err != nil {
|
||||
t.Fatalf("write record: %v", err)
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
t.Fatalf("close writer: %v", err)
|
||||
}
|
||||
if err := f.Close(); err != nil {
|
||||
t.Fatalf("close file: %v", err)
|
||||
}
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("read temp arrow file: %v", err)
|
||||
}
|
||||
return data
|
||||
}
|
||||
|
||||
// TestDataframe_RoundTripCoreTypes pins down the Arrow-schema → internal
|
||||
// (type, format) mapping and the per-cell value shape that buildTypedCell
|
||||
// expects: number cells are json.Number (precision-preserving), date cells
|
||||
// are `yyyy-mm-dd` strings, bool/string come through verbatim. Numbers, dates,
|
||||
// strings, bools, and nulls all in one record so a future Arrow-Go bump can't
|
||||
// quietly regress any one family.
|
||||
func TestDataframe_RoundTripCoreTypes(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
schema := arrow.NewSchema([]arrow.Field{
|
||||
{Name: "name", Type: arrow.BinaryTypes.String},
|
||||
{Name: "qty", Type: arrow.PrimitiveTypes.Int64},
|
||||
{Name: "price", Type: arrow.PrimitiveTypes.Float64, Metadata: arrow.NewMetadata(
|
||||
[]string{"number_format"}, []string{"$#,##0.00"},
|
||||
)},
|
||||
{Name: "active", Type: arrow.FixedWidthTypes.Boolean},
|
||||
{Name: "shipped_on", Type: arrow.FixedWidthTypes.Date32},
|
||||
}, nil)
|
||||
|
||||
jan15 := arrow.Date32FromTime(time.Date(2024, 1, 15, 0, 0, 0, 0, time.UTC))
|
||||
feb02 := arrow.Date32FromTime(time.Date(2024, 2, 2, 0, 0, 0, 0, time.UTC))
|
||||
|
||||
buf := buildArrowIPC(t, schema, func(b *array.RecordBuilder) {
|
||||
b.Field(0).(*array.StringBuilder).AppendValues([]string{"alice", ""}, []bool{true, false})
|
||||
b.Field(1).(*array.Int64Builder).AppendValues([]int64{42, 0}, []bool{true, false})
|
||||
b.Field(2).(*array.Float64Builder).AppendValues([]float64{19.95, 0}, []bool{true, false})
|
||||
b.Field(3).(*array.BooleanBuilder).AppendValues([]bool{true, false}, []bool{true, true})
|
||||
b.Field(4).(*array.Date32Builder).AppendValues([]arrow.Date32{jan15, feb02}, []bool{true, true})
|
||||
})
|
||||
|
||||
spec, err := decodeArrowToSheet(buf, "S1")
|
||||
if err != nil {
|
||||
t.Fatalf("decodeArrowToSheet: %v", err)
|
||||
}
|
||||
if spec.Name != "S1" {
|
||||
t.Errorf("sheet name = %q, want S1", spec.Name)
|
||||
}
|
||||
if len(spec.Columns) != 5 {
|
||||
t.Fatalf("got %d columns, want 5", len(spec.Columns))
|
||||
}
|
||||
want := []struct{ typ, format string }{
|
||||
{"string", "@"},
|
||||
{"number", ""},
|
||||
{"number", "$#,##0.00"},
|
||||
{"bool", ""},
|
||||
{"date", "yyyy-mm-dd"},
|
||||
}
|
||||
for i, w := range want {
|
||||
if spec.Columns[i].Type != w.typ {
|
||||
t.Errorf("columns[%d].Type = %q, want %q", i, spec.Columns[i].Type, w.typ)
|
||||
}
|
||||
if spec.Columns[i].Format != w.format {
|
||||
t.Errorf("columns[%d].Format = %q, want %q", i, spec.Columns[i].Format, w.format)
|
||||
}
|
||||
}
|
||||
|
||||
if len(spec.Rows) != 2 {
|
||||
t.Fatalf("got %d rows, want 2", len(spec.Rows))
|
||||
}
|
||||
// Row 0: every field present, types match what buildTypedCell will accept.
|
||||
row0 := spec.Rows[0]
|
||||
if row0[0] != "alice" {
|
||||
t.Errorf("row0[name] = %#v, want \"alice\"", row0[0])
|
||||
}
|
||||
if n, ok := row0[1].(json.Number); !ok || n.String() != "42" {
|
||||
t.Errorf("row0[qty] = %#v, want json.Number(\"42\")", row0[1])
|
||||
}
|
||||
if n, ok := row0[2].(json.Number); !ok || n.String() != "19.95" {
|
||||
t.Errorf("row0[price] = %#v, want json.Number(\"19.95\")", row0[2])
|
||||
}
|
||||
if row0[3] != true {
|
||||
t.Errorf("row0[active] = %#v, want true", row0[3])
|
||||
}
|
||||
if row0[4] != "2024-01-15" {
|
||||
t.Errorf("row0[shipped_on] = %#v, want \"2024-01-15\"", row0[4])
|
||||
}
|
||||
|
||||
// Row 1: nulls on name/qty/price (despite the buffer values) must become nil
|
||||
// so buildTypedCell paints an empty cell that still carries number_format.
|
||||
row1 := spec.Rows[1]
|
||||
for _, c := range []int{0, 1, 2} {
|
||||
if row1[c] != nil {
|
||||
t.Errorf("row1[%d] = %#v, want nil (null in arrow)", c, row1[c])
|
||||
}
|
||||
}
|
||||
if row1[3] != false {
|
||||
t.Errorf("row1[active] = %#v, want false", row1[3])
|
||||
}
|
||||
if row1[4] != "2024-02-02" {
|
||||
t.Errorf("row1[shipped_on] = %#v, want \"2024-02-02\"", row1[4])
|
||||
}
|
||||
}
|
||||
|
||||
// TestDataframe_Timestamp pins the timestamp → date conversion for the
|
||||
// timestamp[us] case (pandas default for `pd.Timestamp` columns once written
|
||||
// via `to_feather`). Only the calendar date matters for our `yyyy-mm-dd`
|
||||
// landing — guard against TZ drift from the wrong unit pick.
|
||||
func TestDataframe_Timestamp(t *testing.T) {
|
||||
t.Parallel()
|
||||
schema := arrow.NewSchema([]arrow.Field{
|
||||
{Name: "ts", Type: &arrow.TimestampType{Unit: arrow.Microsecond}},
|
||||
}, nil)
|
||||
ts := arrow.Timestamp(time.Date(2024, 6, 12, 14, 30, 0, 0, time.UTC).UnixMicro())
|
||||
buf := buildArrowIPC(t, schema, func(b *array.RecordBuilder) {
|
||||
b.Field(0).(*array.TimestampBuilder).AppendValues([]arrow.Timestamp{ts}, []bool{true})
|
||||
})
|
||||
spec, err := decodeArrowToSheet(buf, "S")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if spec.Columns[0].Type != "date" {
|
||||
t.Errorf("type = %q, want date", spec.Columns[0].Type)
|
||||
}
|
||||
if got := spec.Rows[0][0]; got != "2024-06-12" {
|
||||
t.Errorf("ts = %#v, want \"2024-06-12\"", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestDataframe_EmptySchema rejects an Arrow file whose schema has no fields:
|
||||
// a 0-column "DataFrame" would write a header-less, data-less block that
|
||||
// validates as "writer ran successfully" but produces nothing — the test ties
|
||||
// that off as an explicit error rather than letting it slip through.
|
||||
func TestDataframe_EmptySchema(t *testing.T) {
|
||||
t.Parallel()
|
||||
schema := arrow.NewSchema(nil, nil)
|
||||
buf := buildArrowIPC(t, schema, func(b *array.RecordBuilder) {})
|
||||
_, err := decodeArrowToSheet(buf, "S")
|
||||
if err == nil || !strings.Contains(err.Error(), "no fields") {
|
||||
t.Errorf("err = %v, want 'no fields' error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestDataframe_DuplicateColumn catches duplicate-name columns at decode
|
||||
// time. Validate already rejects duplicate column names for the JSON path;
|
||||
// the Arrow path mirrors that so the error surfaces with the same shape.
|
||||
func TestDataframe_DuplicateColumn(t *testing.T) {
|
||||
t.Parallel()
|
||||
schema := arrow.NewSchema([]arrow.Field{
|
||||
{Name: "x", Type: arrow.BinaryTypes.String},
|
||||
{Name: "x", Type: arrow.PrimitiveTypes.Int64},
|
||||
}, nil)
|
||||
buf := buildArrowIPC(t, schema, func(b *array.RecordBuilder) {
|
||||
b.Field(0).(*array.StringBuilder).Append("")
|
||||
b.Field(1).(*array.Int64Builder).Append(0)
|
||||
})
|
||||
_, err := decodeArrowToSheet(buf, "S")
|
||||
if err == nil || !strings.Contains(err.Error(), "duplicate") {
|
||||
t.Errorf("err = %v, want duplicate-column error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestDataframe_BadBytes rejects a non-Arrow blob with a hint pointing at
|
||||
// pandas df.to_feather so users see what producer is expected without having
|
||||
// to grep the docs.
|
||||
func TestDataframe_BadBytes(t *testing.T) {
|
||||
t.Parallel()
|
||||
_, err := decodeArrowToSheet([]byte("not arrow"), "S")
|
||||
if err == nil || !strings.Contains(err.Error(), "Arrow") {
|
||||
t.Errorf("err = %v, want Arrow-decode error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestDataframe_EncodeRoundTrip checks --dataframe-out's encoder against its
|
||||
// own decoder: build a +table-get-shaped sheet map (the same one
|
||||
// readSheetAsSpec emits), encode to Arrow IPC, decode back via the put-side
|
||||
// decoder, and require the column types / formats / row values to match. If
|
||||
// any encoder choice drifts from what the decoder expects, the round-trip
|
||||
// breaks here long before a real put → get round-trip in production would.
|
||||
func TestDataframe_EncodeRoundTrip(t *testing.T) {
|
||||
t.Parallel()
|
||||
sheet := map[string]interface{}{
|
||||
"name": "S1",
|
||||
"columns": []interface{}{"name", "qty", "price", "active", "ts"},
|
||||
"dtypes": map[string]interface{}{
|
||||
"name": "object",
|
||||
"qty": "float64",
|
||||
"price": "float64",
|
||||
"active": "bool",
|
||||
"ts": "datetime64[ns]",
|
||||
},
|
||||
"formats": map[string]interface{}{
|
||||
// `@` is the writer convention for string columns; readSheetAsSpec
|
||||
// strips it via isTextNumberFormat, so an Arrow file built from a
|
||||
// real read won't carry @ either. Keep it absent here to mirror
|
||||
// the production wire shape.
|
||||
"price": "$#,##0.00",
|
||||
},
|
||||
"data": []interface{}{
|
||||
[]interface{}{"alice", json.Number("42"), json.Number("19.95"), true, "2024-01-15"},
|
||||
[]interface{}{"bob", nil, json.Number("8.5"), false, "2024-02-02"},
|
||||
},
|
||||
}
|
||||
blob, err := encodeSheetMapToArrowIPC(sheet)
|
||||
if err != nil {
|
||||
t.Fatalf("encodeSheetMapToArrowIPC: %v", err)
|
||||
}
|
||||
spec, err := decodeArrowToSheet(blob, "S1")
|
||||
if err != nil {
|
||||
t.Fatalf("decodeArrowToSheet: %v", err)
|
||||
}
|
||||
wantTypes := []string{"string", "number", "number", "bool", "date"}
|
||||
wantFormats := []string{"@", "", "$#,##0.00", "", "yyyy-mm-dd"}
|
||||
if len(spec.Columns) != len(wantTypes) {
|
||||
t.Fatalf("got %d columns, want %d", len(spec.Columns), len(wantTypes))
|
||||
}
|
||||
for i, w := range wantTypes {
|
||||
if spec.Columns[i].Type != w {
|
||||
t.Errorf("columns[%d].Type = %q, want %q", i, spec.Columns[i].Type, w)
|
||||
}
|
||||
if spec.Columns[i].Format != wantFormats[i] {
|
||||
t.Errorf("columns[%d].Format = %q, want %q", i, spec.Columns[i].Format, wantFormats[i])
|
||||
}
|
||||
}
|
||||
if len(spec.Rows) != 2 {
|
||||
t.Fatalf("got %d rows, want 2", len(spec.Rows))
|
||||
}
|
||||
if spec.Rows[0][0] != "alice" {
|
||||
t.Errorf("row0[name] = %#v, want alice", spec.Rows[0][0])
|
||||
}
|
||||
if n, ok := spec.Rows[0][1].(json.Number); !ok || n.String() != "42" {
|
||||
t.Errorf("row0[qty] = %#v, want json.Number(\"42\")", spec.Rows[0][1])
|
||||
}
|
||||
if spec.Rows[0][3] != true {
|
||||
t.Errorf("row0[active] = %#v, want true", spec.Rows[0][3])
|
||||
}
|
||||
if spec.Rows[0][4] != "2024-01-15" {
|
||||
t.Errorf("row0[ts] = %#v, want 2024-01-15", spec.Rows[0][4])
|
||||
}
|
||||
// qty is null on row1, must come back as nil (not a zero-valued
|
||||
// json.Number that would later round-trip as 0).
|
||||
if spec.Rows[1][1] != nil {
|
||||
t.Errorf("row1[qty] = %#v, want nil (null arrow cell)", spec.Rows[1][1])
|
||||
}
|
||||
}
|
||||
|
||||
// TestDataframe_EncodeAcceptsBothRowShapes pins the encoder against the two
|
||||
// shapes `sheet["data"]` actually arrives in: `[][]interface{}` from a live
|
||||
// readSheetAsSpec call (production), and `[]interface{}` from a JSON
|
||||
// unmarshal (round-trip / fixtures). Either must produce non-empty Arrow
|
||||
// output — early on the production shape silently fell through the
|
||||
// `[]interface{}` type assertion and we shipped a 0-row Arrow blob.
|
||||
func TestDataframe_EncodeAcceptsBothRowShapes(t *testing.T) {
|
||||
t.Parallel()
|
||||
base := func(data interface{}) map[string]interface{} {
|
||||
return map[string]interface{}{
|
||||
"name": "S",
|
||||
"columns": []interface{}{"city"},
|
||||
"dtypes": map[string]interface{}{"city": "object"},
|
||||
"data": data,
|
||||
}
|
||||
}
|
||||
for label, data := range map[string]interface{}{
|
||||
"production [][]interface{}": [][]interface{}{{"BJ"}, {"SH"}},
|
||||
"unmarshal []interface{}": []interface{}{[]interface{}{"BJ"}, []interface{}{"SH"}},
|
||||
} {
|
||||
blob, err := encodeSheetMapToArrowIPC(base(data))
|
||||
if err != nil {
|
||||
t.Errorf("%s: encode: %v", label, err)
|
||||
continue
|
||||
}
|
||||
spec, err := decodeArrowToSheet(blob, "S")
|
||||
if err != nil {
|
||||
t.Errorf("%s: decode: %v", label, err)
|
||||
continue
|
||||
}
|
||||
if len(spec.Rows) != 2 {
|
||||
t.Errorf("%s: got %d rows, want 2", label, len(spec.Rows))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestDataframe_DtypeToInternalType pins the inverse of typeToDtype so
|
||||
// readSheetAsSpec's dtype labels recover the right internal type. Covers the
|
||||
// dtype families +table-get emits today plus the safe fallback for unknown
|
||||
// labels (string, lossless).
|
||||
func TestDataframe_DtypeToInternalType(t *testing.T) {
|
||||
t.Parallel()
|
||||
cases := map[string]string{
|
||||
"float64": "number",
|
||||
"int64": "number",
|
||||
"Int64": "number",
|
||||
"bool": "bool",
|
||||
"boolean": "bool",
|
||||
"datetime64[ns]": "date",
|
||||
"datetime64[ms]": "date",
|
||||
"object": "string",
|
||||
"": "string",
|
||||
"weird-new-dtype": "string",
|
||||
}
|
||||
for in, want := range cases {
|
||||
if got := dtypeToInternalType(in); got != want {
|
||||
t.Errorf("dtypeToInternalType(%q) = %q, want %q", in, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestDataframe_BytesWriterSeeker confirms the in-memory WriteSeeker handles
|
||||
// the Seek-and-overwrite pattern ipc.NewFileWriter uses to patch the footer
|
||||
// offset: write some bytes, seek back to the middle, overwrite, end up with
|
||||
// the buffer reflecting the overwritten bytes (not a tail-extended duplicate).
|
||||
func TestDataframe_BytesWriterSeeker(t *testing.T) {
|
||||
t.Parallel()
|
||||
var w bytesWriterSeeker
|
||||
if _, err := w.Write([]byte("hello world")); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if _, err := w.Seek(6, 0); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if _, err := w.Write([]byte("WORLD")); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if got := string(w.buf); got != "hello WORLD" {
|
||||
t.Errorf("buf = %q, want \"hello WORLD\"", got)
|
||||
}
|
||||
}
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -52,7 +53,7 @@ var TablePut = common.Shortcut{
|
||||
if _, err := resolveSpreadsheetToken(runtime); err != nil {
|
||||
return err
|
||||
}
|
||||
_, err := parseTablePutPayload(runtime)
|
||||
_, err := resolveTablePayload(runtime)
|
||||
return err
|
||||
},
|
||||
DryRun: func(ctx context.Context, runtime *common.RuntimeContext) *common.DryRunAPI {
|
||||
@@ -63,7 +64,7 @@ var TablePut = common.Shortcut{
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
payload, err := parseTablePutPayload(runtime)
|
||||
payload, err := resolveTablePayload(runtime)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -72,9 +73,32 @@ var TablePut = common.Shortcut{
|
||||
Tips: []string{
|
||||
"Writes into an existing spreadsheet — pass --url or --spreadsheet-token. To create a new workbook first, use +workbook-create, then point --spreadsheet-token here.",
|
||||
"Payload sheets are matched to existing sub-sheets by name (created when absent). Date columns take ISO yyyy-mm-dd strings — converted to real dates (serial + date format).",
|
||||
"Two equivalent producers: --sheets (multi-sheet JSON, the pandas-split convention) or --dataframe (single-sheet Arrow IPC binary, what `df.to_feather()` writes). Mutually exclusive; pick by what your producer already emits.",
|
||||
},
|
||||
}
|
||||
|
||||
// resolveTablePayload picks between --sheets (JSON, multi-sheet) and
|
||||
// --dataframe (Arrow IPC, single-sheet), enforces XOR, and returns the
|
||||
// unified internal tablePayload. Both +table-put and +workbook-create funnel
|
||||
// through here so the two entry points stay in lockstep; Validate / Execute /
|
||||
// DryRun / workbookCreateData all share this one decision. Network-free.
|
||||
func resolveTablePayload(rctx *common.RuntimeContext) (*tablePayload, error) {
|
||||
sheetsGiven := rctx.Changed("sheets") && strings.TrimSpace(rctx.Str("sheets")) != ""
|
||||
dfGiven := rctx.Changed("dataframe") && strings.TrimSpace(rctx.Str("dataframe")) != ""
|
||||
if sheetsGiven && dfGiven {
|
||||
return nil, common.FlagErrorf("--sheets and --dataframe are mutually exclusive")
|
||||
}
|
||||
if !sheetsGiven && !dfGiven {
|
||||
// Mirror the original "--sheets is required" message but list both
|
||||
// alternatives so users discover the binary entry from the error.
|
||||
return nil, common.FlagErrorf("one of --sheets or --dataframe is required")
|
||||
}
|
||||
if dfGiven {
|
||||
return parseDataframePayload(rctx)
|
||||
}
|
||||
return parseTablePutPayload(rctx)
|
||||
}
|
||||
|
||||
// ─── protocol ─────────────────────────────────────────────────────────
|
||||
|
||||
type tablePayload struct {
|
||||
@@ -601,6 +625,16 @@ func writeSheetData(ctx context.Context, runtime *common.RuntimeContext, token,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Grow the sub-sheet to fit the write block before the first batch.
|
||||
// Without this, writes past the sheet's initial dimensions (typically the
|
||||
// backend default of 200 rows × 20 cols — which also covers
|
||||
// `+workbook-create`'s adopted Sheet1) fail with [900015206] range exceeds
|
||||
// sheet bounds. Best-effort: if reading dims fails the downstream write
|
||||
// will surface the same out-of-bounds error it did before this helper.
|
||||
if err := ensureSheetCapacity(ctx, runtime, token, sheetID, baseRow+len(matrix), col0+ncols); err != nil {
|
||||
return nil, fmt.Errorf("ensuring sheet capacity: %w", err)
|
||||
}
|
||||
|
||||
startCol := columnIndexToLetter(col0)
|
||||
endCol := columnIndexToLetter(col0 + ncols - 1)
|
||||
allowOverwrite := s.AllowOverwrite == nil || *s.AllowOverwrite
|
||||
@@ -819,6 +853,91 @@ func sheetCreateDims(s *tableSheetSpec) (rows, cols int) {
|
||||
return rows, cols
|
||||
}
|
||||
|
||||
// ensureSheetCapacity grows the sub-sheet's row / column count enough to
|
||||
// fit a needRows × needCols write block before the next set_cell_range call.
|
||||
// Both +table-put writing into an existing sheet *and* +workbook-create's
|
||||
// adopted default Sheet1 inherit the backend's 200×20 default — anything
|
||||
// past that would error with `[900015206] range exceeds sheet bounds`.
|
||||
// Read failures (e.g. mock stubs without `row_count`) silently fall through;
|
||||
// the downstream write surfaces the same out-of-bounds error it did before,
|
||||
// so the helper can't make things worse. Backend hard ceilings (50000 rows,
|
||||
// 200 cols) are honored.
|
||||
func ensureSheetCapacity(ctx context.Context, runtime *common.RuntimeContext, token, sheetID string, needRows, needCols int) error {
|
||||
if needRows <= 0 && needCols <= 0 {
|
||||
return nil
|
||||
}
|
||||
out, err := callTool(ctx, runtime, token, ToolKindRead, "get_sheet_structure", map[string]interface{}{
|
||||
"excel_id": token,
|
||||
"sheet_id": sheetID,
|
||||
})
|
||||
if err != nil {
|
||||
// best-effort: skip if we can't see current dims (mock without stub,
|
||||
// permissions, transient failure). The write below will still bounce
|
||||
// if the sheet really is too small, so degrading silently here is
|
||||
// fail-open by design.
|
||||
return nil //nolint:nilerr
|
||||
}
|
||||
m, _ := out.(map[string]interface{})
|
||||
// get_sheet_structure reports current dims as the `range` field
|
||||
// (e.g. "A1:T200" → 20 cols × 200 rows). splitCellRef parses the
|
||||
// bottom-right corner into 0-based (col, row).
|
||||
curRows, curCols := 0, 0
|
||||
if rng, _ := m["range"].(string); rng != "" {
|
||||
parts := strings.SplitN(rng, ":", 2)
|
||||
if len(parts) == 2 {
|
||||
if c, r, ok := splitCellRef(parts[1]); ok {
|
||||
curCols = c + 1
|
||||
curRows = r + 1
|
||||
}
|
||||
}
|
||||
}
|
||||
if needRows > curRows && curRows > 0 {
|
||||
target := needRows
|
||||
if target > 50000 {
|
||||
target = 50000
|
||||
}
|
||||
if target > curRows {
|
||||
// position is 1-based and must be ≤ curRows (the backend rejects
|
||||
// "before row N+1" as out-of-range). Inserting before the last
|
||||
// existing row pushes that row down and effectively appends
|
||||
// `count` blank rows — the data writes that follow will overwrite
|
||||
// the existing rows from row 1, so the placement of the inserted
|
||||
// blanks doesn't matter as long as the total dimension grows.
|
||||
input := map[string]interface{}{
|
||||
"excel_id": token,
|
||||
"sheet_id": sheetID,
|
||||
"operation": "insert",
|
||||
"position": strconv.Itoa(curRows),
|
||||
"count": target - curRows,
|
||||
}
|
||||
if _, err := callTool(ctx, runtime, token, ToolKindWrite, "modify_sheet_structure", input); err != nil {
|
||||
return fmt.Errorf("growing rows %d → %d: %w", curRows, target, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
if needCols > curCols && curCols > 0 {
|
||||
target := needCols
|
||||
if target > 200 {
|
||||
target = 200
|
||||
}
|
||||
if target > curCols {
|
||||
// Same 1-based, ≤-current constraint as rows: insert before the
|
||||
// last existing column letter.
|
||||
input := map[string]interface{}{
|
||||
"excel_id": token,
|
||||
"sheet_id": sheetID,
|
||||
"operation": "insert",
|
||||
"position": columnIndexToLetter(curCols - 1),
|
||||
"count": target - curCols,
|
||||
}
|
||||
if _, err := callTool(ctx, runtime, token, ToolKindWrite, "modify_sheet_structure", input); err != nil {
|
||||
return fmt.Errorf("growing cols %d → %d: %w", curCols, target, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// listSheetIDsByName maps every existing sub-sheet's display name to its id via
|
||||
// a single get_workbook_structure read. Used by write mode to decide which
|
||||
// payload sheets already exist.
|
||||
@@ -894,7 +1013,7 @@ func tablePutPartial(token string, spreadsheet interface{}, written []interface{
|
||||
// by Validate, so errors here degrade to an empty preview rather than twice.
|
||||
func tablePutDryRun(runtime *common.RuntimeContext) *common.DryRunAPI {
|
||||
dry := common.NewDryRunAPI()
|
||||
payload, err := parseTablePutPayload(runtime)
|
||||
payload, err := resolveTablePayload(runtime)
|
||||
if err != nil {
|
||||
return dry
|
||||
}
|
||||
@@ -951,6 +1070,15 @@ var TableGet = common.Shortcut{
|
||||
if strings.TrimSpace(runtime.Str("sheet-id")) != "" && strings.TrimSpace(runtime.Str("sheet-name")) != "" {
|
||||
return common.FlagErrorf("--sheet-id and --sheet-name are mutually exclusive")
|
||||
}
|
||||
// --dataframe-out is Arrow IPC, which carries one schema per file — a
|
||||
// whole-workbook read can't ride that shape. Surface the constraint
|
||||
// before we round-trip to the API instead of after the read fails to
|
||||
// encode.
|
||||
if strings.TrimSpace(runtime.Str("dataframe-out")) != "" {
|
||||
if strings.TrimSpace(runtime.Str("sheet-id")) == "" && strings.TrimSpace(runtime.Str("sheet-name")) == "" {
|
||||
return common.FlagErrorf("--dataframe-out requires --sheet-id or --sheet-name (single-sheet only); for the whole workbook, drop --dataframe-out and use the default JSON output")
|
||||
}
|
||||
}
|
||||
return nil
|
||||
},
|
||||
DryRun: func(ctx context.Context, runtime *common.RuntimeContext) *common.DryRunAPI {
|
||||
@@ -992,12 +1120,38 @@ var TableGet = common.Shortcut{
|
||||
}
|
||||
sheets = append(sheets, spec)
|
||||
}
|
||||
// Arrow IPC binary branch: Validate already guards single-sheet so the
|
||||
// sheets slice has exactly one entry here. Stdout mode owns stdout for
|
||||
// the binary stream — no envelope, raw Arrow only. File mode writes
|
||||
// the Arrow blob to disk and still emits the lark-cli JSON envelope
|
||||
// (output_path / bytes / sheet_name) so scripted callers can detect
|
||||
// success the same way they do for every other shortcut.
|
||||
if dfOut := strings.TrimSpace(runtime.Str("dataframe-out")); dfOut != "" {
|
||||
spec, _ := sheets[0].(map[string]interface{})
|
||||
data, err := encodeSheetMapToArrowIPC(spec)
|
||||
if err != nil {
|
||||
return common.FlagErrorf("--dataframe-out: encode arrow: %v", err)
|
||||
}
|
||||
if err := writeDataframeOut(runtime, dfOut, data); err != nil {
|
||||
return err
|
||||
}
|
||||
if dfOut == "-" {
|
||||
return nil
|
||||
}
|
||||
runtime.Out(map[string]interface{}{
|
||||
"output_path": strings.TrimPrefix(dfOut, "@"),
|
||||
"bytes": len(data),
|
||||
"sheet_name": spec["name"],
|
||||
}, nil)
|
||||
return nil
|
||||
}
|
||||
runtime.Out(map[string]interface{}{"sheets": sheets}, nil)
|
||||
return nil
|
||||
},
|
||||
Tips: []string{
|
||||
"Output is the same shape +table-put consumes — pipe it back in, or load sheets[].rows into a DataFrame keyed by columns[].name.",
|
||||
"Column types are inferred per column, but only when every non-empty cell agrees; a column mixing types (e.g. numbers + \"暂无\") degrades to string — lossless and round-trips cleanly. Numeric coercion of dirty cells is the caller's job (pandas to_numeric(errors=\"coerce\") on the string column).",
|
||||
"For a pandas round-trip, use --dataframe-out (single sheet, Arrow IPC / Feather v2) — `@./x.arrow` writes a file, `-` streams binary to stdout for `pd.read_feather(BytesIO(stdout))`. Multi-sheet reads stay on the JSON path.",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -571,17 +571,23 @@ var WorkbookCreate = common.Shortcut{
|
||||
if strings.TrimSpace(runtime.Str("title")) == "" {
|
||||
return common.FlagErrorf("--title is required")
|
||||
}
|
||||
// --sheets (typed) is an alternative, mutually exclusive data entry to the
|
||||
// untyped --values. Gated on Changed (not just non-empty): an explicitly-
|
||||
// given but empty --sheets (e.g. empty stdin / file) is an error, not a
|
||||
// silent fall-through to creating an empty workbook.
|
||||
if runtime.Changed("sheets") {
|
||||
// --sheets (typed JSON) and --dataframe (typed Arrow IPC) are two
|
||||
// alternative typed data entries; both are mutually exclusive with
|
||||
// the untyped --values. Gating on Changed (not just non-empty) catches
|
||||
// an explicitly-given but empty payload as an error instead of letting
|
||||
// it fall through to creating an empty workbook.
|
||||
sheetsGiven := runtime.Changed("sheets")
|
||||
dfGiven := runtime.Changed("dataframe")
|
||||
if sheetsGiven && dfGiven {
|
||||
return common.FlagErrorf("--sheets and --dataframe are mutually exclusive")
|
||||
}
|
||||
if (sheetsGiven || dfGiven) && runtime.Str("values") != "" {
|
||||
return common.FlagErrorf("--values is mutually exclusive with --sheets/--dataframe")
|
||||
}
|
||||
if sheetsGiven {
|
||||
if strings.TrimSpace(runtime.Str("sheets")) == "" {
|
||||
return common.FlagErrorf("--sheets was given but resolved to empty (empty stdin/file?); pass a typed payload, or drop --sheets to create an empty workbook")
|
||||
}
|
||||
if runtime.Str("values") != "" {
|
||||
return common.FlagErrorf("--sheets is mutually exclusive with --values")
|
||||
}
|
||||
payload, err := parseTablePutPayload(runtime)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -589,6 +595,17 @@ var WorkbookCreate = common.Shortcut{
|
||||
_, err = parseWorkbookCreateSheetStyles(runtime, payload)
|
||||
return err
|
||||
}
|
||||
if dfGiven {
|
||||
if strings.TrimSpace(runtime.Str("dataframe")) == "" {
|
||||
return common.FlagErrorf("--dataframe was given but resolved to empty; pass a path to an Arrow IPC file, or drop --dataframe to create an empty workbook")
|
||||
}
|
||||
payload, err := parseDataframePayload(runtime)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = parseWorkbookCreateSheetStyles(runtime, payload)
|
||||
return err
|
||||
}
|
||||
// Untyped --values path: parse (and validate) --styles as a single sheet
|
||||
// style item, then synthesize --values into a type-less typed payload —
|
||||
// the same construction buildValuesPayload runs at execute time, so any
|
||||
@@ -731,6 +748,17 @@ func workbookCreateData(runtime *common.RuntimeContext) (*tablePayload, *workboo
|
||||
}
|
||||
return payload, styles, nil
|
||||
}
|
||||
if runtime.Changed("dataframe") {
|
||||
payload, err := parseDataframePayload(runtime)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
styles, err := parseWorkbookCreateSheetStyles(runtime, payload)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
return payload, styles, nil
|
||||
}
|
||||
styles, err := parseValuesSheetStyles(runtime)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
|
||||
@@ -130,6 +130,7 @@ _公共:URL/token(无 sheet 定位) · 系统:`--dry-run`_
|
||||
| `--sheet-name` | string | optional | 只读该子表(按名);省略则读所有子表 |
|
||||
| `--range` | string | optional | 读取的 A1 范围;省略则读每个子表的当前数据区 |
|
||||
| `--no-header` | bool | optional | 把第一行当数据而非表头(列名取 col1/col2 …) |
|
||||
| `--dataframe-out` | string | optional | 以一份 Arrow IPC 文件(Feather v2)格式输出 typed 表格,替代默认的 JSON 输出。用 `@<path>` 传文件或 `-` 写二进制 stdout(同其他 binary I/O flag 的约定)。是 `+table-put` / `+workbook-create` 入口 `--dataframe` 的镜像 —— pandas 端 `pd.read_feather("x.arrow")` 或 `pd.read_feather(io.BytesIO(stdout))` 一行读回。仅支持单 sheet:必须给 `--sheet-id` 或 `--sheet-name`;读整本 workbook 仍走默认 JSON。列类型沿用 typed 读回(string/number/date/bool);`number_format` 以 Arrow Field metadata 保留,Arrow 文件可直接喂回 `+table-put --dataframe`。 |
|
||||
|
||||
## Examples
|
||||
|
||||
@@ -200,6 +201,34 @@ df_sales = sheets["销售"]
|
||||
|
||||
> 显示格式(千分位、百分比、自定义日期)在 `sheet["formats"]`,pandas 不消费;改完数据 round-trip 回去时透传给 `+table-put` 即可,飞书侧显示不变。
|
||||
|
||||
#### `--dataframe-out`(Arrow IPC / Feather v2 二进制读出)
|
||||
|
||||
`--dataframe-out` 是 `+table-put` 入口 `--dataframe` 的镜像:把 typed 读回直接编码成 Arrow IPC 文件,pandas 端一行 `pd.read_feather()` 读回——省掉 JSON 解析 + `astype(dtypes)`,列类型 / `number_format` 走 Arrow schema + Field metadata 保真。**仅支持单 sheet**(一个 Arrow 文件只能承载一份 schema),必须给 `--sheet-id` 或 `--sheet-name`;读整本 workbook 仍走默认 JSON。
|
||||
|
||||
```bash
|
||||
# 文件
|
||||
lark-cli sheets +table-get --url "<表URL>" --sheet-name "销售" --dataframe-out @./out.arrow
|
||||
# binary stdout(不落盘)
|
||||
lark-cli sheets +table-get --url "<表URL>" --sheet-name "销售" --dataframe-out -
|
||||
```
|
||||
|
||||
```python
|
||||
import io, pandas as pd, subprocess
|
||||
|
||||
# 1) 文件
|
||||
subprocess.run(["lark-cli","sheets","+table-get","--url",URL,
|
||||
"--sheet-name","销售","--dataframe-out","@./out.arrow"], check=True)
|
||||
df = pd.read_feather("./out.arrow")
|
||||
|
||||
# 2) stdin/stdout 管道(不落盘)—— 跟 --dataframe 写入侧对称的一行
|
||||
res = subprocess.run(["lark-cli","sheets","+table-get","--url",URL,
|
||||
"--sheet-name","销售","--dataframe-out","-"],
|
||||
capture_output=True, check=True)
|
||||
df = pd.read_feather(io.BytesIO(res.stdout))
|
||||
```
|
||||
|
||||
> `number_format` 进 Arrow Field metadata(key=`number_format`),Arrow 文件可以直接喂回 `+table-put --dataframe` round-trip 写回,types / formats 一路保真。
|
||||
|
||||
#### round-trip:读 → 改 → 写回(写读对偶)
|
||||
|
||||
`sheet_to_df` 和 write-cells reference 里的 `df_to_sheet` 是一对镜像 helper,round-trip 三段读 / 改 / 写各一行:
|
||||
|
||||
@@ -139,8 +139,9 @@ _系统:`--dry-run`_
|
||||
| `--title` | string | required | 新 spreadsheet 标题 |
|
||||
| `--folder-token` | string | optional | 目标文件夹 token;省略时放在云空间根目录 |
|
||||
| `--values` | string + File + Stdin(简单 JSON) | optional | untyped 初始数据,一个 JSON 二维数组(表头并入第一行):`[["列A","列B"],["alice",95]]`;值原样写入、类型由飞书自动识别,走与 --sheets 相同的分批 `+cells-set`;配 --styles 控制格式/颜色/合并/行列尺寸 |
|
||||
| `--sheets` | string + File + Stdin(复合 JSON) | optional | 建表后写入的 typed 表格协议 JSON(同 +table-put):顶层 sheets 数组,每项 `{name, start_cell?, mode?, header?, allow_overwrite?, columns:["colA","colB",...], data:[[...]], dtypes?:{colA:pandasDtype, ...}, formats?:{colA:numberFormat, ...}}`。Agents 通常用 `{**json.loads(df.to_json(orient="split")), "dtypes": df.dtypes.astype(str).to_dict()}` 一行构造。与 --values 互斥;新表默认子表复用为第一个子表,日期/数字类型保真。 |
|
||||
| `--sheets` | string + File + Stdin(复合 JSON) | optional | 建表后写入的 typed 表格协议 JSON(同 +table-put):顶层 sheets 数组,每项 `{name, start_cell?, mode?, header?, allow_overwrite?, columns:["colA","colB",...], data:[[...]], dtypes?:{colA:pandasDtype, ...}, formats?:{colA:numberFormat, ...}}`。Agents 通常用 `{**json.loads(df.to_json(orient="split")), "dtypes": df.dtypes.astype(str).to_dict()}` 一行构造。与 --values、--dataframe 互斥;新表默认子表复用为第一个子表,日期/数字类型保真。 |
|
||||
| `--styles` | string + File + Stdin(复合 JSON) | optional | 建表时同时写入的视觉处理操作 JSON:顶层 `{styles:[...]}`,每项对应一个目标子表、含 `name`,并至少给 `cell_styles` / `row_sizes` / `col_sizes` / `cell_merges` 之一。`cell_styles` 用 A1 单元格 range + 扁平样式字段(字段同 +cells-set-style,含 number_format / 颜色 / 对齐 / border_styles);row/col sizes 用行/列范围 + type/size;merges 用单元格 range + 可选 merge_type。与 --sheets 搭配时 styles 数组长度/顺序/name 必须与 --sheets.sheets 对应;与 --values 搭配时只给一个 styles 项(其 name 忽略)。 |
|
||||
| `--dataframe` | string | optional | 单 sheet 类型保真表格的二进制入口,从一个 Arrow IPC 文件(Feather v2,pandas `df.to_feather()` 直接写出)读入,与 --values / --sheets 互斥。用 `@<path>` 传文件或 `-` 读二进制 stdin(同其他输入 flag 的约定)。Arrow 字节按原样读 —— 不做 TrimSpace / BOM strip,IPC magic 字节完整保留(区别于文本类输入 flag)。列类型从 Arrow schema 推导;每列的 `number_format` 可写在 Arrow Field metadata 里。建表后写入默认子表(`Sheet1` —— 直接复用,不残留空 Sheet1)。要多子表或换落点,请改用 `--sheets`。 |
|
||||
|
||||
### `+workbook-export`
|
||||
|
||||
@@ -199,7 +200,7 @@ _一个或多个子表的 typed 数据,每个数组元素写入一张子表;
|
||||
|
||||
### `+workbook-create`
|
||||
|
||||
新建电子表格,可选预填数据。两种数据入口(untyped `--values` / typed `--sheets`)**互斥**,按需二选一——两者都走同一条分批 `set_cell_range` 写入:
|
||||
新建电子表格,可选预填数据。三种数据入口(untyped `--values` / typed `--sheets` JSON / typed `--dataframe` Arrow 二进制)**三方互斥**,按需选一——三者都走同一条分批 `set_cell_range` 写入:
|
||||
|
||||
```bash
|
||||
# 1) untyped:--values(一个二维数组,表头并入第一行;值原样写、类型由飞书自动识别,
|
||||
@@ -207,7 +208,7 @@ _一个或多个子表的 typed 数据,每个数组元素写入一张子表;
|
||||
lark-cli sheets +workbook-create --title "销售" \
|
||||
--values '[["门店","销售额"],["北京",259874]]'
|
||||
|
||||
# 2) typed:--sheets(一步建表 + 类型保真)。date 列落成真日期(可排序/透视)、
|
||||
# 2) typed JSON:--sheets(一步建表 + 类型保真)。date 列落成真日期(可排序/透视)、
|
||||
# number 不丢精度、string 列保前导零(如订单号 00123);多子表一次建。
|
||||
lark-cli sheets +workbook-create --title "交易" --sheets '{
|
||||
"sheets":[
|
||||
@@ -217,9 +218,16 @@ lark-cli sheets +workbook-create --title "交易" --sheets '{
|
||||
"formats":{"金额":"#,##0.00"},
|
||||
"data":[["2024-01-15",1234.5,"00123"]]}
|
||||
]}'
|
||||
|
||||
# 3) typed binary:--dataframe(pandas df.to_feather 直接出,Arrow IPC / Feather v2)。
|
||||
# 单子表(落点固定为新表的默认子表,原地复用、不残留空 Sheet1),列类型从 Arrow
|
||||
# schema 自动恢复,无需手填 dtypes/formats;要多子表回到 --sheets。
|
||||
lark-cli sheets +workbook-create --title "交易" --dataframe @./in.arrow
|
||||
# 或走 stdin(不落盘):
|
||||
python prepare.py | lark-cli sheets +workbook-create --title "交易" --dataframe -
|
||||
```
|
||||
|
||||
`--sheets` 协议与 `+table-put` 完全同构(字段含义见 lark-sheets-write-cells 的 `+table-put`,大 payload 走 stdin / `@file`)。关键差异:**新建工作簿的默认子表会被复用为第一个子表**(重命名后承载数据),不会残留空 `Sheet1`;其余子表按需新建。它把 `+table-put` 单独做不到的"建表 + typed 写入"合到一条命令,是「pandas 算完直接落地一张带真日期的新表」的首选。回读校验用 `+table-get`(与 `--sheets` 同构、可 round-trip)。
|
||||
`--sheets` 协议与 `+table-put` 完全同构(字段含义见 lark-sheets-write-cells 的 `+table-put`,大 payload 走 stdin / `@file`);`--dataframe` 是同一份 typed 数据的二进制 wire(Arrow IPC,详见同 reference 的 `+table-put` 段落的 `--dataframe` 小节),按 producer 已有的 API 选——pandas 走 `--dataframe`,多子表 / 手拼 JSON 走 `--sheets`。关键差异:**新建工作簿的默认子表会被复用为第一个子表**(重命名后承载数据),不会残留空 `Sheet1`;其余子表按需新建。它把 `+table-put` 单独做不到的"建表 + typed 写入"合到一条命令,是「pandas 算完直接落地一张带真日期的新表」的首选。回读校验用 `+table-get`(与 `--sheets` 同构、可 round-trip;pandas 用户也可走 `--dataframe-out` 直拿 Arrow 文件)。
|
||||
|
||||
`--styles` 可在建表写入时同时写视觉处理。它和 `--sheets` 一样只有一种外层写法:顶层对象里放 `styles` 数组;数组每项对应一个子表,含 `name`,并按能力拆成四类可选数组:
|
||||
|
||||
|
||||
@@ -317,7 +317,8 @@ _公共:URL/token(无 sheet 定位) · 系统:`--dry-run`_
|
||||
|
||||
| Flag | Type | 必填 | 说明 |
|
||||
| --- | --- | --- | --- |
|
||||
| `--sheets` | string + File + Stdin(复合 JSON) | required | Typed 表格协议(pandas-DataFrame-shaped)JSON:顶层 sheets 数组,每项 `{name, start_cell?, mode?, header?, allow_overwrite?, columns:["colA","colB",...], data:[[...]], dtypes?:{colA:pandasDtype, ...}, formats?:{colA:numberFormat, ...}}`。Agents 通常用 `{**json.loads(df.to_json(orient="split")), "dtypes": df.dtypes.astype(str).to_dict()}` 一行构造。`dtypes` 值是 pandas dtype 字符串(`int64`、`float64`、`Int64`、`bool`、`boolean`、`datetime64[ns]`、`object`、...),CLI 端映射成内部 string/number/date/bool —— 省略 `dtypes` 时该列按文本写入(适合原始 CSV-shaped 数据)。`formats[col]` 是 Excel number_format 字符串(如 `#,##0.00`、`0.0%`、`yyyy-mm`);缺省时 date 列用 `yyyy-mm-dd`,string 列用文本格式 `@`。 |
|
||||
| `--sheets` | string + File + Stdin(复合 JSON) | xor | Typed 表格协议(pandas-DataFrame-shaped)JSON,与 `--dataframe` 互斥:顶层 sheets 数组,每项 `{name, start_cell?, mode?, header?, allow_overwrite?, columns:["colA","colB",...], data:[[...]], dtypes?:{colA:pandasDtype, ...}, formats?:{colA:numberFormat, ...}}`。Agents 通常用 `{**json.loads(df.to_json(orient="split")), "dtypes": df.dtypes.astype(str).to_dict()}` 一行构造。`dtypes` 值是 pandas dtype 字符串(`int64`、`float64`、`Int64`、`bool`、`boolean`、`datetime64[ns]`、`object`、...),CLI 端映射成内部 string/number/date/bool —— 省略 `dtypes` 时该列按文本写入(适合原始 CSV-shaped 数据)。`formats[col]` 是 Excel number_format 字符串(如 `#,##0.00`、`0.0%`、`yyyy-mm`);缺省时 date 列用 `yyyy-mm-dd`,string 列用文本格式 `@`。 |
|
||||
| `--dataframe` | string | xor | 单 sheet 类型保真表格的二进制入口,从一个 Arrow IPC 文件(即 Feather v2,pandas `df.to_feather()` 直接写出)读入,与 `--sheets` 互斥。用 `@<path>` 传文件或 `-` 读二进制 stdin(同其他输入 flag 的约定)。Arrow 字节按原样读 —— 不做 TrimSpace / BOM strip,IPC magic 字节完整保留(区别于文本类输入 flag)。列类型从 Arrow schema 推导(int*/uint*/float* → number,date32/date64/timestamp → date,utf8/large_utf8 → string,bool → bool);每列的 `number_format` 可写在 Arrow Field metadata 里(`pa.field("price", pa.float64(), metadata={b"number_format": b"$#,##0.00"})`)。子表走默认落点:名为 `Sheet1`(缺则新建),从 A1 起覆盖写并带表头。要换子表名 / 起始位置 / 写入方式,或要写多子表,请改用 `--sheets`。 |
|
||||
|
||||
## Schemas
|
||||
|
||||
@@ -520,6 +521,41 @@ payload = {"sheets": [{
|
||||
|
||||
> **dtype 速查**:`int64`/`float64`(数值)、`Int64`(含空值的整数,nullable)、`bool`/`boolean`、`datetime64[ns]`(date,默认 `yyyy-mm-dd`)、`object`(string)。pandas dtype 字符串原样塞进 dtypes 即可,CLI 端按前缀匹配(`int*`/`uint*`/`Int*`/`float*` → number 等)。未识别 dtype 兜底为 string。
|
||||
|
||||
#### `--dataframe`(Arrow IPC / Feather v2 二进制入口)
|
||||
|
||||
`--dataframe` 与 `--sheets` 互斥、功能等价,但走二进制 wire——pandas `df.to_feather()` 写出的 Arrow IPC 文件直接喂 CLI,类型从 Arrow schema 自动恢复,**不用再手填 dtypes/formats**,也自动绕过 NaT / NaN / `datetime64[ns, tz]` 的 JSON 序列化坑。子表落点固定为 `Sheet1`、A1 起覆盖写、带表头;要换子表名 / 起始位置 / 多子表,回到 `--sheets` JSON 协议。
|
||||
|
||||
```bash
|
||||
# 文件(cwd 相对路径;受 SafePath 沙箱约束,不接受绝对路径)
|
||||
lark-cli sheets +table-put --url "<表URL>" --dataframe @./in.arrow
|
||||
# stdin 二进制(不落盘)
|
||||
python prepare.py | lark-cli sheets +table-put --url "<表URL>" --dataframe -
|
||||
```
|
||||
|
||||
```python
|
||||
import io, subprocess, pandas as pd
|
||||
df = pd.DataFrame({"date": pd.to_datetime(["2024-01-15"]), "amount": [1234.5], "id": ["00123"]})
|
||||
|
||||
# 1) 文件
|
||||
df.to_feather("./in.arrow") # 写到当前目录
|
||||
subprocess.run(["lark-cli","sheets","+table-put","--url",URL,"--dataframe","@./in.arrow"], check=True)
|
||||
|
||||
# 2) stdin(不落盘)—— pandas 写 BytesIO,subprocess 把 buf 灌进去
|
||||
buf = io.BytesIO(); df.to_feather(buf)
|
||||
subprocess.run(["lark-cli","sheets","+table-put","--url",URL,"--dataframe","-"],
|
||||
input=buf.getvalue(), check=True)
|
||||
```
|
||||
|
||||
> 每列的 `number_format` 写在 Arrow Field metadata 里,CLI 端自动透传到飞书显示格式(千分位 / 百分比 / 自定义日期等):
|
||||
> ```python
|
||||
> import pyarrow as pa, pyarrow.feather as feather
|
||||
> table = pa.Table.from_pandas(df)
|
||||
> schema = table.schema.set(
|
||||
> table.schema.get_field_index("amount"),
|
||||
> pa.field("amount", pa.float64(), metadata={b"number_format": b"#,##0.00"}))
|
||||
> feather.write_feather(table.cast(schema), "./in.arrow")
|
||||
> ```
|
||||
|
||||
### Validate / DryRun / Execute 约束
|
||||
|
||||
- `Validate`:XOR 公共四件套;`+cells-set` 的 `--cells` 必须能解析为 JSON 二维矩阵且行列数与 `--range` 完全一致;`+cells-set-style` 的样式 flag 至少一个非空(或带 `--border-styles`);`+cells-set-image` 的 `--range` 必须是单 cell(起止 cell 相同);`+csv-put` 的 `--csv` 必须能按 RFC 4180 解析;防爆参数上限校验。
|
||||
|
||||
Reference in New Issue
Block a user