Skip to content

Commit 3683af6

Browse files
committed
Speed up byteain by not parsing traditional-style input twice.
Instead of laboriously computing the exact output length, use strlen to get an upper bound cheaply. (This is still O(N) of course, but the constant factor is a lot less.) This will typically result in overallocating the output datum, but that's of little concern since it's a short-lived allocation in just about all use-cases.

A simple microbenchmark showed about 40% speedup for long input strings.

While here, make some cosmetic cleanups and add a test case that covers the double-backslash code path in byteain and byteaout.

Author: Steven Niu <niushiji@gmail.com>
Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Reviewed-by: Stepan Neretin <slpmcf@gmail.com>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Discussion: https://postgr.es/m/ca315729-140b-426e-81a6-6cd5cfe7ecc5@gmail.com
1 parent 84409ed commit 3683af6

File tree

3 files changed

+30
-45
lines changed

3 files changed

+30
-45
lines changed

src/backend/utils/adt/bytea.c

Lines changed: 16 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -182,27 +182,21 @@ bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
182182
*
183183
* Non-printable characters must be passed as '\nnn' (octal) and are
184184
* converted to internal form. '\' must be passed as '\\'.
185-
* ereport(ERROR, ...) if bad form.
186-
*
187-
* BUGS:
188-
* The input is scanned twice.
189-
* The error checking of input is minimal.
190185
*/
191186
Datum
192187
byteain(PG_FUNCTION_ARGS)
193188
{
194189
char *inputText = PG_GETARG_CSTRING(0);
195190
Node *escontext = fcinfo->context;
191+
size_t len = strlen(inputText);
192+
size_t bc;
196193
char *tp;
197194
char *rp;
198-
int bc;
199195
bytea *result;
200196

201197
/* Recognize hex input */
202198
if (inputText[0] == '\\' && inputText[1] == 'x')
203199
{
204-
size_t len = strlen(inputText);
205-
206200
bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
207201
result = palloc(bc);
208202
bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result),
@@ -213,70 +207,47 @@ byteain(PG_FUNCTION_ARGS)
213207
}
214208

215209
/* Else, it's the traditional escaped style */
216-
for (bc = 0, tp = inputText; *tp != '\0'; bc++)
217-
{
218-
if (tp[0] != '\\')
219-
tp++;
220-
else if ((tp[0] == '\\') &&
221-
(tp[1] >= '0' && tp[1] <= '3') &&
222-
(tp[2] >= '0' && tp[2] <= '7') &&
223-
(tp[3] >= '0' && tp[3] <= '7'))
224-
tp += 4;
225-
else if ((tp[0] == '\\') &&
226-
(tp[1] == '\\'))
227-
tp += 2;
228-
else
229-
{
230-
/*
231-
* one backslash, not followed by another or ### valid octal
232-
*/
233-
ereturn(escontext, (Datum) 0,
234-
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
235-
errmsg("invalid input syntax for type %s", "bytea")));
236-
}
237-
}
238-
239-
bc += VARHDRSZ;
240-
241-
result = (bytea *) palloc(bc);
242-
SET_VARSIZE(result, bc);
210+
result = (bytea *) palloc(len + VARHDRSZ); /* maximum possible length */
243211

244212
tp = inputText;
245213
rp = VARDATA(result);
246214
while (*tp != '\0')
247215
{
248216
if (tp[0] != '\\')
249217
*rp++ = *tp++;
250-
else if ((tp[0] == '\\') &&
251-
(tp[1] >= '0' && tp[1] <= '3') &&
218+
else if ((tp[1] >= '0' && tp[1] <= '3') &&
252219
(tp[2] >= '0' && tp[2] <= '7') &&
253220
(tp[3] >= '0' && tp[3] <= '7'))
254221
{
255-
bc = VAL(tp[1]);
256-
bc <<= 3;
257-
bc += VAL(tp[2]);
258-
bc <<= 3;
259-
*rp++ = bc + VAL(tp[3]);
222+
int v;
223+
224+
v = VAL(tp[1]);
225+
v <<= 3;
226+
v += VAL(tp[2]);
227+
v <<= 3;
228+
*rp++ = v + VAL(tp[3]);
260229

261230
tp += 4;
262231
}
263-
else if ((tp[0] == '\\') &&
264-
(tp[1] == '\\'))
232+
else if (tp[1] == '\\')
265233
{
266234
*rp++ = '\\';
267235
tp += 2;
268236
}
269237
else
270238
{
271239
/*
272-
* We should never get here. The first pass should not allow it.
240+
* one backslash, not followed by another or ### valid octal
273241
*/
274242
ereturn(escontext, (Datum) 0,
275243
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
276244
errmsg("invalid input syntax for type %s", "bytea")));
277245
}
278246
}
279247

248+
bc = rp - VARDATA(result); /* actual length */
249+
SET_VARSIZE(result, bc + VARHDRSZ);
250+
280251
PG_RETURN_BYTEA_P(result);
281252
}
282253

src/test/regress/expected/strings.out

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,12 @@ SELECT E'De\\678dBeEf'::bytea;
236236
ERROR: invalid input syntax for type bytea
237237
LINE 1: SELECT E'De\\678dBeEf'::bytea;
238238
^
239+
SELECT E'DeAd\\\\BeEf'::bytea;
240+
bytea
241+
----------------------
242+
\x446541645c42654566
243+
(1 row)
244+
239245
SELECT reverse(''::bytea);
240246
reverse
241247
---------
@@ -291,6 +297,12 @@ SELECT E'De\\123dBeEf'::bytea;
291297
DeSdBeEf
292298
(1 row)
293299

300+
SELECT E'DeAd\\\\BeEf'::bytea;
301+
bytea
302+
------------
303+
DeAd\\BeEf
304+
(1 row)
305+
294306
-- Test non-error-throwing API too
295307
SELECT pg_input_is_valid(E'\\xDeAdBeE', 'bytea');
296308
pg_input_is_valid

src/test/regress/sql/strings.sql

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ SELECT E'De\\000dBeEf'::bytea;
7676
SELECT E'De\123dBeEf'::bytea;
7777
SELECT E'De\\123dBeEf'::bytea;
7878
SELECT E'De\\678dBeEf'::bytea;
79+
SELECT E'DeAd\\\\BeEf'::bytea;
7980
8081
SELECT reverse(''::bytea);
8182
SELECT reverse('\xaa'::bytea);
@@ -88,6 +89,7 @@ SELECT E'\\xDe00BeEf'::bytea;
8889
SELECT E'DeAdBeEf'::bytea;
8990
SELECT E'De\\000dBeEf'::bytea;
9091
SELECT E'De\\123dBeEf'::bytea;
92+
SELECT E'DeAd\\\\BeEf'::bytea;
9193
9294
-- Test non-error-throwing API too
9395
SELECT pg_input_is_valid(E'\\xDeAdBeE', 'bytea');

0 commit comments

Comments
 (0)