-
-
Notifications
You must be signed in to change notification settings - Fork 282
Expand file tree
/
Copy pathpickle.hexpat
More file actions
357 lines (338 loc) · 14.7 KB
/
pickle.hexpat
File metadata and controls
357 lines (338 loc) · 14.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
/*
References:
Pickle Source Code:
https://github.com/python/cpython/blob/main/Lib/pickle.py
Pickle Protocol Version Breakdown:
https://docs.python.org/3.13/library/pickle.html#data-stream-format
Pickle OpCode Breakdown:
https://github.com/python/cpython/blob/main/Lib/pickletools.py
*/
#pragma author ODeux
#pragma description Python Binary Object Serialization Protocol
#pragma endian little
import std.mem;
import std.string;
#pragma array_limit 524288
/*
    Abort evaluation with a "TODO" error that reports the current cursor offset.
    message: text describing what has not been implemented yet.
*/
fn todo(auto message){
    /* The message is passed as a format argument instead of being spliced into
       the format string, so braces inside it are printed verbatim rather than
       being interpreted as placeholders. */
    std::error(std::format("@0x{:08X} TODO: {}", $, message));
};
/*
    Formatter/transformer for counted UTF-8 fields: returns the value rendered
    through std::format("{}").
    s: the raw field value.
*/
fn utf8_fmt(auto s){
    str text = std::format("{}", s);
    return text;
};
#define UTF8_FMT format("utf8_fmt"), transform("utf8_fmt")
/*
    Formatter/transformer for line-based text fields: drops the last character
    (the line terminator consumed by readline()) and renders the remainder.
    s: the raw line, including its final character.
*/
fn utf8_rl_fmt(auto s){
    return std::format("{}", std::string::substr(s, 0, std::string::length(s) - 1));
};
#define UTF8_RL_FMT format("utf8_rl_fmt"), transform("utf8_rl_fmt")
/*
    Formatter/transformer for the INT opcode argument.
    The two special lines "01\n" and "00\n" stand for the booleans True and
    False; any other line has its last character removed and is parsed as an
    integer with base 0.
    s: the raw line, including its final character.
*/
fn int_rl_fmt(auto s){
    if(s == "01\n")
        return true;  /* == TRUE(b'I01\n')[1:] */
    else if(s == "00\n")
        return false; /* == FALSE(b'I00\n')[1:] */
    str digits = std::string::substr(s, 0, std::string::length(s) - 1);
    return std::string::parse_int(digits, 0);
};
#define INT_RL_FMT format("int_rl_fmt"), transform("int_rl_fmt")
/*
    Formatter/transformer for the FLOAT opcode argument: removes the last
    character of the line and parses the rest as a floating point number.
    s: the raw line, including its final character.
*/
fn float_rl_fmt(auto s){
    return std::string::parse_float(std::string::substr(s, 0, std::string::length(s) - 1));
};
#define FLOAT_RL_FMT format("float_rl_fmt"), transform("float_rl_fmt")
/*
    Formatter/transformer for the LONG opcode argument.
    Removes the last character of the line, then an optional trailing "L"
    suffix, and parses what is left as an integer with base 0.
    s: the raw line, including its final character.
*/
fn long_rl_fmt(auto s){
    str digits = std::string::substr(s, 0, std::string::length(s) - 1);
    /* Compare string against string: std::string::at() yields a single
       character, which is not the same type as the literal "L". */
    if(digits != "" && std::string::ends_with(digits, "L"))
        digits = std::string::substr(digits, 0, std::string::length(digits) - 1);
    return std::string::parse_int(digits, 0);
};
#define LONG_RL_FMT format("long_rl_fmt"), transform("long_rl_fmt")
/*
    Formatter/transformer for line-based ASCII fields: returns the line without
    its last character.
    s: the raw line, including its final character.
*/
fn ascii_rl_fmt(auto s){
    str line = std::string::substr(s, 0, std::string::length(s) - 1);
    return line;
};
#define ASCII_RL_FMT format("ascii_rl_fmt"), transform("ascii_rl_fmt")
/*
    Formatter/transformer for line-based memo indices (GET / PUT): removes the
    last character of the line and parses the rest as an integer with base 0.
    s: the raw line, including its final character.
*/
fn integer_rl_fmt(auto s){
    return std::string::parse_int(std::string::substr(s, 0, std::string::length(s) - 1), 0);
};
#define INTEGER_RL_FMT format("integer_rl_fmt"), transform("integer_rl_fmt")
/*
    Formatter/transformer for the STRING opcode argument.
    Removes the last character of the line, checks that the remainder is wrapped
    in a matching pair of single or double quotes, and returns the text between
    the quotes. Raises an error when the argument is not quoted.
    s: the raw line, including its final character.
*/
fn string_rl_fmt(auto s){
    str new_s = std::string::substr(s, 0, std::string::length(s) - 1);
    auto length = std::string::length(new_s);
    if(length >= 2 && new_s[0] == new_s[length - 1] && (new_s[0] == '\'' || new_s[0] == '"'))
        /* Skip the opening quote and stop before the closing one: the payload is
           length - 2 characters long (equivalent to Python's data[1:-1]). */
        new_s = std::string::substr(new_s, 1, length - 2);
    else std::error("the STRING opcode argument must be quoted");
    return new_s;
};
#define STRING_RL_FMT format("string_rl_fmt"), transform("string_rl_fmt")
/*
    Every pickle opcode, stored as a single byte. The first group has no protocol
    marker in this table; the later groups are labelled with the protocol version
    they belong to.
*/
enum OpcodesEnum: u8{
MARK = '(', /* push special markobject on stack */
STOP = '.', /* every pickle ends with STOP */
POP = '0', /* discard topmost stack item */
POP_MARK = '1', /* discard stack top through topmost markobject */
DUP = '2', /* duplicate top stack item */
FLOAT = 'F', /* push float object; decimal string argument */
INT = 'I', /* push integer or bool; decimal string argument */
BININT = 'J', /* push four-byte signed int */
BININT1 = 'K', /* push 1-byte unsigned int */
LONG = 'L', /* push long; decimal string argument */
BININT2 = 'M', /* push 2-byte unsigned int */
NONE = 'N', /* push None */
PERSID = 'P', /* push persistent object; id is taken from string arg */
BINPERSID = 'Q', /* push persistent object; id is taken from stack */
REDUCE = 'R', /* apply callable to argtuple, both on stack */
STRING = 'S', /* push string; NL-terminated string argument */
BINSTRING = 'T', /* push string; counted binary string argument */
SHORT_BINSTRING = 'U', /* push string; counted binary string argument < 256 bytes */
UNICODE = 'V', /* push Unicode string; raw-unicode-escaped argument */
BINUNICODE = 'X', /* push Unicode string; counted UTF-8 string argument */
APPEND = 'a', /* append stack top to list below it */
BUILD = 'b', /* call __setstate__ or __dict__.update() */
GLOBAL = 'c', /* push self.find_class(modname, name); 2 string args */
DICT = 'd', /* build a dict from stack items */
EMPTY_DICT = '}', /* push empty dict */
APPENDS = 'e', /* extend list on stack by topmost stack slice */
GET = 'g', /* push item from memo on stack; index is string arg */
BINGET = 'h', /* push item from memo on stack; index is 1-byte arg */
INST = 'i', /* build & push class instance */
LONG_BINGET = 'j', /* push item from memo on stack; index is 4-byte arg */
LIST = 'l', /* build list from topmost stack items */
EMPTY_LIST = ']', /* push empty list */
OBJ = 'o', /* build & push class instance */
PUT = 'p', /* store stack top in memo; index is string arg */
BINPUT = 'q', /* store stack top in memo; index is 1-byte arg */
LONG_BINPUT = 'r', /* store stack top in memo; index is 4-byte arg */
SETITEM = 's', /* add key+value pair to dict */
TUPLE = 't', /* build tuple from topmost stack items */
EMPTY_TUPLE = ')', /* push empty tuple */
SETITEMS = 'u', /* modify dict by adding topmost key+value pairs */
BINFLOAT = 'G', /* push float; arg is 8-byte float encoding */
/* ---- Protocol 2 ---- */
PROTO = 0x80, /* identify pickle protocol */
NEWOBJ = 0x81, /* build object by applying cls.__new__ to argtuple */
EXT1 = 0x82, /* push object from extension registry; 1-byte index */
EXT2 = 0x83, /* push object from extension registry; 2-byte index */
EXT4 = 0x84, /* push object from extension registry; 4-byte index */
TUPLE1 = 0x85, /* build 1-tuple from stack top */
TUPLE2 = 0x86, /* build 2-tuple from two topmost stack items */
TUPLE3 = 0x87, /* build 3-tuple from three topmost stack items */
NEWTRUE = 0x88, /* push True */
NEWFALSE = 0x89, /* push False */
LONG1 = 0x8A, /* push long from < 256 bytes */
LONG4 = 0x8B, /* push really big long */
/* ---- Protocol 3 (Python 3.x) ---- */
BINBYTES = 'B', /* push bytes; counted binary string argument */
SHORT_BINBYTES = 'C', /* push bytes; counted binary string argument < 256 bytes */
/* ---- Protocol 4 ---- */
SHORT_BINUNICODE = 0x8C, /* push short string; UTF-8 length < 256 bytes */
BINUNICODE8 = 0x8D, /* push very long string */
BINBYTES8 = 0x8E, /* push very long bytes string */
EMPTY_SET = 0x8F, /* push empty set on the stack */
ADDITEMS = 0x90, /* modify set by adding topmost stack items */
FROZENSET = 0x91, /* build frozenset from topmost stack items */
NEWOBJ_EX = 0x92, /* like NEWOBJ but works with keyword-only arguments */
STACK_GLOBAL = 0x93, /* same as GLOBAL but using names on the stack */
MEMOIZE = 0x94, /* store top of the stack in memo */
FRAME = 0x95, /* indicate the beginning of a new frame */
/* ---- Protocol 5 ---- */
BYTEARRAY8 = 0x96, /* push bytearray */
NEXT_BUFFER = 0x97, /* push next out-of-band buffer */
READONLY_BUFFER = 0x98 /* make top of stack readonly */
};
/*
    Measure the line that starts at the current cursor position.
    Scans forward byte by byte until a '\n' is found and returns the number of
    bytes in the line, counting the '\n' itself. The cursor is not moved.
*/
fn readline(){
    auto count = 1;
    /* count - 1 is the offset of the byte currently being inspected */
    while(std::mem::read_unsigned($ + count - 1, 1) != '\n'){
        count += 1;
    }
    return count;
};
/*
    One pickle instruction: a single opcode byte followed by the inline argument,
    if any, that the opcode carries. Opcodes that only act on values already on
    the stack have an empty body. An unknown opcode byte raises an error.
*/
struct Opcodes{
    OpcodesEnum opcode;
    match(opcode){
        (OpcodesEnum::MARK): {}
        (OpcodesEnum::STOP): break; /* STOP terminates the enclosing opcode array */
        (OpcodesEnum::POP): {}
        (OpcodesEnum::POP_MARK): {}
        (OpcodesEnum::DUP): {}
        (OpcodesEnum::FLOAT): {
            char Float[readline()] [[FLOAT_RL_FMT]]; /* float(readline()[:-1]) */
        }
        (OpcodesEnum::INT): {
            /* == TRUE(b'I01\n')[1:], == FALSE(b'I00\n')[1:], int(readline(), 0) */
            char Int[readline()] [[INT_RL_FMT]];
        }
        (OpcodesEnum::BININT): {
            s32 Int;
        }
        (OpcodesEnum::BININT1): {
            /* BININT1 pushes a 1-byte *unsigned* int, so values 128..255 must
               not be shown as negative numbers. */
            u8 Int;
        }
        (OpcodesEnum::LONG): {
            /* val = readline()[:-1], val = val and val[-1] == b"L"[0] ? val[:-1]: val */
            char Long[readline()] [[LONG_RL_FMT]]; /* int(val, 0) */
        }
        (OpcodesEnum::BININT2): {
            u16 Int;
        }
        (OpcodesEnum::NONE): {}
        (OpcodesEnum::PERSID): {
            char id[readline()] [[ASCII_RL_FMT]]; /* readline()[:-1].decode("ascii") */
        }
        (OpcodesEnum::BINPERSID): {}
        (OpcodesEnum::REDUCE): {}
        /*
        def _decode_string(self, value):
            # Used to allow strings from Python 2 to be decoded either as bytes or Unicode strings.
            # This should be used only with the STRING, BINSTRING and SHORT_BINSTRING opcodes.
            if self.encoding == "bytes":
                return value
            else:
                return value.decode(self.encoding, self.errors)
        */
        (OpcodesEnum::STRING): {
            /* data must be in quotes ("..." or '...'), dataStripped = stripQuote(readline()[:-1]) */
            /* _decode_string(codecs.escape_decode(dataStripped)[0]) */
            char data[readline()] [[STRING_RL_FMT]];
        }
        (OpcodesEnum::BINSTRING): {
            s32 length;
            char data[length]; /* _decode_string(data) */
        }
        (OpcodesEnum::SHORT_BINSTRING): {
            u8 length;
            char data[length]; /* _decode_string(data) */
        }
        (OpcodesEnum::UNICODE): {
            /*
            "raw-unicode-escape":
            Latin-1 encoding with \uXXXX and \UXXXXXXXX for other code points.
            Existing backslashes are not escaped in any way.
            */
            char data[readline()] [[UTF8_RL_FMT]]; /* str(readline()[:-1], "raw-unicode-escape") */
        }
        (OpcodesEnum::BINUNICODE): {
            u32 length;
            char data[length] [[UTF8_FMT]]; /* str(data, "utf-8", "surrogatepass") */
        }
        (OpcodesEnum::APPEND): {}
        (OpcodesEnum::BUILD): {}
        (OpcodesEnum::GLOBAL): {
            char module[readline()] [[UTF8_RL_FMT]]; /* readline()[:-1].decode("utf-8") */
            char name[readline()] [[UTF8_RL_FMT]]; /* readline()[:-1].decode("utf-8") */
        }
        (OpcodesEnum::DICT): {}
        (OpcodesEnum::EMPTY_DICT): {}
        (OpcodesEnum::APPENDS): {}
        (OpcodesEnum::GET): {
            char index[readline()] [[INTEGER_RL_FMT]]; /* int(readline()[:-1]) */
        }
        (OpcodesEnum::BINGET): {
            u8 index;
        }
        (OpcodesEnum::INST): {
            char module[readline()] [[ASCII_RL_FMT]]; /* readline()[:-1].decode("ascii") */
            char name[readline()] [[ASCII_RL_FMT]]; /* readline()[:-1].decode("ascii") */
        }
        (OpcodesEnum::LONG_BINGET): {
            u32 index;
        }
        (OpcodesEnum::LIST): {}
        (OpcodesEnum::EMPTY_LIST): {}
        (OpcodesEnum::OBJ): {}
        (OpcodesEnum::PUT): {
            char index[readline()] [[INTEGER_RL_FMT]]; /* int(readline()[:-1]) */
        }
        (OpcodesEnum::BINPUT): {
            /* Memo index is a 1-byte unsigned value, the same width and
               signedness as the BINGET index that reads it back. */
            u8 index;
        }
        (OpcodesEnum::LONG_BINPUT): {
            u32 index;
        }
        (OpcodesEnum::SETITEM): {}
        (OpcodesEnum::TUPLE): {}
        (OpcodesEnum::EMPTY_TUPLE): {}
        (OpcodesEnum::SETITEMS): {}
        (OpcodesEnum::BINFLOAT): {
            be double Double; /* the only big-endian field in this pattern */
        }
        /* ---- Protocol 2 ---- */
        (OpcodesEnum::PROTO): {
            u8 version;
        }
        (OpcodesEnum::NEWOBJ): {}
        (OpcodesEnum::EXT1): {
            u8 code;
        }
        (OpcodesEnum::EXT2): {
            u16 code;
        }
        (OpcodesEnum::EXT4): {
            s32 code;
        }
        (OpcodesEnum::TUPLE1): {}
        (OpcodesEnum::TUPLE2): {}
        (OpcodesEnum::TUPLE3): {}
        (OpcodesEnum::NEWTRUE): {}
        (OpcodesEnum::NEWFALSE): {}
        /*
        def decode_long(data):
            r"""Decode a long from a two's complement little-endian binary string.
            >>> decode_long(b"") => 0
            >>> decode_long(b"\xff\x00") => 255
            >>> decode_long(b"\xff\x7f") => 32767
            >>> decode_long(b"\x00\xff") => -256
            >>> decode_long(b"\x00\x80") => -32768
            >>> decode_long(b"\x80") => -128
            >>> decode_long(b"\x7f") => 127
            """
            return int.from_bytes(data, byteorder="little", signed=True)
        */
        (OpcodesEnum::LONG1): {
            u8 length;
            u8 data[length]; /* decode_long(data) */
        }
        (OpcodesEnum::LONG4): {
            s32 length;
            u8 data[length]; /* decode_long(data) */
        }
        /* ---- Protocol 3 (Python 3.x) ---- */
        (OpcodesEnum::BINBYTES): {
            u32 length;
            u8 bytes[length];
        }
        (OpcodesEnum::SHORT_BINBYTES): {
            u8 length;
            u8 bytes[length];
        }
        /* ---- Protocol 4 ---- */
        (OpcodesEnum::SHORT_BINUNICODE): {
            u8 length;
            char data[length] [[UTF8_FMT]]; /* str(data, "utf-8", "surrogatepass") */
        }
        (OpcodesEnum::BINUNICODE8): {
            u64 length;
            char data[length] [[UTF8_FMT]]; /* str(data, "utf-8", "surrogatepass") */
        }
        (OpcodesEnum::BINBYTES8): {
            u64 length;
            u8 bytes[length];
        }
        (OpcodesEnum::EMPTY_SET): {}
        (OpcodesEnum::ADDITEMS): {}
        (OpcodesEnum::FROZENSET): {}
        (OpcodesEnum::NEWOBJ_EX): {}
        (OpcodesEnum::STACK_GLOBAL): {}
        (OpcodesEnum::MEMOIZE): {}
        (OpcodesEnum::FRAME): {
            u64 length;
            /* Nested opcodes are parsed until the cursor reaches the end of the
               frame payload, i.e. `length` bytes after the length field. */
            Opcodes opcodes[while($ < addressof(length) + sizeof(length) + length)];
        }
        /* ---- Protocol 5 ---- */
        (OpcodesEnum::BYTEARRAY8): {
            u64 length;
            u8 array[length];
        }
        (OpcodesEnum::NEXT_BUFFER): {}
        (OpcodesEnum::READONLY_BUFFER): {}
        (_): std::error(std::format("Unrecognized {}", opcode));
    }
};
/* Top-level pattern: a sequence of opcodes read until the end of the data. */
struct Pickle{
Opcodes opcodes[while(!std::mem::eof())];
};
/* Place the pattern at offset 0 of the data. */
Pickle pickle @ 0x0;