solidc
Robust collection of general-purpose cross-platform C libraries and data structures designed for rapid and safe development in C
Loading...
Searching...
No Matches
regex.c
1#include "../include/regex.h"
2
3#include <assert.h> /* for assert */
4#include <stdatomic.h>/* for atomic_fetch_add, atomic_fetch_sub */
5#include <stdio.h> /* for snprintf */
6#include <stdlib.h> /* for malloc, free */
7#include <string.h> /* for strlen, strncpy */
8
13struct regex_s {
14 pcre2_code* code;
15 char* pattern;
16 uint32_t group_count;
17 atomic_int refcount;
18};
19
24struct regex_ctx_s {
25 pcre2_match_data* match_data;
26};
27
31struct regex_iter_s {
32 regex_t* re;
33 regex_ctx_t* ctx;
34 const char* subject;
35 size_t len;
36 size_t offset;
37};
38
39/* ---------------------------------------------------------------------------
40 * Internal helpers
41 * ------------------------------------------------------------------------- */
42
50static void pcre2_err_message(int errcode, char* buf, size_t buf_len) {
51 PCRE2_UCHAR8 tmp[256];
52 if (pcre2_get_error_message(errcode, tmp, sizeof(tmp)) < 0) {
53 snprintf(buf, buf_len, "PCRE2 error %d (no message available)", errcode);
54 } else {
55 snprintf(buf, buf_len, "%s", (const char*)tmp);
56 }
57}
58
68static void fill_match(const regex_t* re, pcre2_match_data* md, int rc, regex_match_t* match) {
69 const PCRE2_SIZE* ov = pcre2_get_ovector_pointer(md);
70
71 /* The match count from pcre2_match is the number of *filled* pairs;
72 * groups beyond that still exist but were not captured this run. */
73 uint32_t total = re->group_count + 1; /* include g0 */
74 if (total > REGEX_MAX_GROUPS) {
75 total = REGEX_MAX_GROUPS;
76 }
77 match->count = total;
78
79 for (uint32_t i = 0; i < total; i++) {
80 match->group[i].start = ov[2 * i];
81 match->group[i].end = ov[2 * i + 1];
82 }
83
84 /* Zero out any trailing slots not covered by this match. */
85 for (uint32_t i = total; i < REGEX_MAX_GROUPS; i++) {
86 match->group[i].start = 0;
87 match->group[i].end = 0;
88 }
89
90 (void)rc; /* rc is used implicitly via the ovector; suppress unused warning */
91}
92
93regex_status_t regex_compile(const char* pattern, regex_flags_t flags, regex_t** out, char* errbuf, size_t errbuf_len) {
94 if (pattern == NULL || out == NULL) {
95 return REGEX_ERROR_ARGS;
96 }
97 *out = NULL;
98
99 /* Compile via PCRE2. */
100 int errcode = 0;
101 PCRE2_SIZE erroffset = 0;
102
103 pcre2_code* code = pcre2_compile((PCRE2_SPTR8)pattern, PCRE2_ZERO_TERMINATED, (uint32_t)flags, &errcode, &erroffset,
104 NULL /* use default compile context */);
105
106 if (code == NULL) {
107 if (errbuf != NULL && errbuf_len > 0) {
108 PCRE2_UCHAR8 tmp[256];
109 pcre2_get_error_message(errcode, tmp, sizeof(tmp));
110 snprintf(errbuf, errbuf_len, "pattern error at offset %zu: %s", (size_t)erroffset, (const char*)tmp);
111 }
112 return REGEX_ERROR;
113 }
114
115 /* Study / JIT-compile for speed if JIT is available. */
116 pcre2_jit_compile(code, PCRE2_JIT_COMPLETE);
117
118 /* Query capture group count before committing. */
119 uint32_t group_count = 0;
120 pcre2_pattern_info(code, PCRE2_INFO_CAPTURECOUNT, &group_count);
121
122 if (group_count + 1 > REGEX_MAX_GROUPS) {
123 /* The total slots needed (groups + g0) exceed our cap. */
124 pcre2_code_free(code);
125 if (errbuf != NULL && errbuf_len > 0) {
126 snprintf(errbuf, errbuf_len, "pattern has %u capture groups; limit is %d", group_count,
127 REGEX_MAX_GROUPS - 1);
128 }
129 return REGEX_ERROR_LIMIT;
130 }
131
132 /* Duplicate the pattern string for later introspection. */
133 char* pat_dup = strdup(pattern);
134 if (pat_dup == NULL) {
135 pcre2_code_free(code);
136 return REGEX_ERROR_NOMEM;
137 }
138
139 /* Allocate and initialise the regex_t wrapper. */
140 regex_t* re = malloc(sizeof(*re));
141 if (re == NULL) {
142 free(pat_dup);
143 pcre2_code_free(code);
144 return REGEX_ERROR_NOMEM;
145 }
146
147 *re = (regex_t){
148 .code = code,
149 .pattern = pat_dup,
150 .group_count = group_count,
151 /* refcount initialised below via atomic store */
152 };
153 atomic_store(&re->refcount, 1);
154
155 *out = re;
156 return REGEX_OK;
157}
158
160 if (re != NULL) {
161 atomic_fetch_add(&re->refcount, 1);
162 }
163 return re;
164}
165
167 if (re == NULL) {
168 return;
169 }
170
171 if (atomic_fetch_sub(&re->refcount, 1) == 1) {
172 /* Last reference: tear down. */
173 pcre2_code_free(re->code);
174 free(re->pattern);
175 free(re);
176 }
177}
178
179/* ---------------------------------------------------------------------------
180 * Execution context
181 * ------------------------------------------------------------------------- */
182
184 if (out == NULL) {
185 return REGEX_ERROR_ARGS;
186 }
187 *out = NULL;
188
189 regex_ctx_t* ctx = malloc(sizeof(*ctx));
190 if (ctx == NULL) {
191 return REGEX_ERROR_NOMEM;
192 }
193
194 /* Allocate the match-data block for up to REGEX_MAX_GROUPS pairs.
195 * We use the "from pattern" variant with a NULL code pointer so we can
196 * pass an explicit pair count. That variant is pcre2_match_data_create,
197 * which takes an ovector pair count directly. */
198 ctx->match_data = pcre2_match_data_create(REGEX_MAX_GROUPS, NULL);
199 if (ctx->match_data == NULL) {
200 free(ctx);
201 return REGEX_ERROR_NOMEM;
202 }
203
204 *out = ctx;
205 return REGEX_OK;
206}
207
209 if (ctx == NULL) {
210 return;
211 }
212 pcre2_match_data_free(ctx->match_data);
213 free(ctx);
214}
215
216/* ---------------------------------------------------------------------------
217 * Matching
218 * ------------------------------------------------------------------------- */
219
220regex_status_t regex_exec(const regex_t* re, regex_ctx_t* ctx, const char* subject, size_t len, size_t offset,
221 regex_match_t* match) {
222 if (re == NULL || ctx == NULL || subject == NULL || match == NULL) {
223 return REGEX_ERROR_ARGS;
224 }
225 if (offset > len) {
226 return REGEX_ERROR_ARGS;
227 }
228
229 int rc = pcre2_match(re->code, (PCRE2_SPTR8)subject, (PCRE2_SIZE)len, (PCRE2_SIZE)offset, 0 /* no extra flags */,
230 ctx->match_data, NULL /* use default match context */);
231
232 if (rc == PCRE2_ERROR_NOMATCH) {
233 return REGEX_NO_MATCH;
234 }
235 if (rc < 0) {
236 return REGEX_ERROR;
237 }
238
239 fill_match(re, ctx->match_data, rc, match);
240 return REGEX_OK;
241}
242
243regex_status_t regex_match(const regex_t* re, regex_ctx_t* ctx, const char* subject, regex_match_t* match) {
244 if (subject == NULL) {
245 return REGEX_ERROR_ARGS;
246 }
247 return regex_exec(re, ctx, subject, strlen(subject), 0, match);
248}
249
250bool regex_is_match(const regex_t* re, regex_ctx_t* ctx, const char* subject, size_t len) {
251 if (re == NULL || ctx == NULL || subject == NULL) {
252 return false;
253 }
254
255 int rc = pcre2_match(re->code, (PCRE2_SPTR8)subject, (PCRE2_SIZE)len, 0, 0, ctx->match_data, NULL);
256
257 return rc > 0;
258}
259
260/* ---------------------------------------------------------------------------
261 * Iterator
262 * ------------------------------------------------------------------------- */
263
264regex_status_t regex_iter_init(regex_t* re, regex_ctx_t* ctx, const char* subject, size_t len, regex_iter_t** out) {
265 if (re == NULL || ctx == NULL || subject == NULL || out == NULL) {
266 return REGEX_ERROR_ARGS;
267 }
268 *out = NULL;
269
270 regex_iter_t* iter = malloc(sizeof(*iter));
271 if (iter == NULL) {
272 return REGEX_ERROR_NOMEM;
273 }
274
275 *iter = (regex_iter_t){
276 .re = regex_retain(re),
277 .ctx = ctx,
278 .subject = subject,
279 .len = len,
280 .offset = 0,
281 };
282
283 *out = iter;
284 return REGEX_OK;
285}
286
288 if (iter == NULL || match == NULL) {
289 return REGEX_ERROR_ARGS;
290 }
291 if (iter->offset > iter->len) {
292 return REGEX_NO_MATCH;
293 }
294
295 int rc = pcre2_match(iter->re->code, (PCRE2_SPTR8)iter->subject, (PCRE2_SIZE)iter->len, (PCRE2_SIZE)iter->offset, 0,
296 iter->ctx->match_data, NULL);
297
298 if (rc == PCRE2_ERROR_NOMATCH) {
299 return REGEX_NO_MATCH;
300 }
301 if (rc < 0) {
302 return REGEX_ERROR;
303 }
304
305 fill_match(iter->re, iter->ctx->match_data, rc, match);
306
307 /* Advance the offset past this match to avoid re-matching.
308 * If the match is zero-length we must advance by at least one byte to
309 * prevent an infinite loop. This mirrors the behaviour of most regex
310 * engines (Perl, Python, Go) for zero-width matches. */
311 size_t end = match->group[0].end;
312 if (end == iter->offset) {
313 iter->offset = end + 1;
314 } else {
315 iter->offset = end;
316 }
317
318 return REGEX_OK;
319}
320
322 if (iter == NULL) {
323 return;
324 }
325 regex_free(iter->re);
326 free(iter);
327}
328
329/* ---------------------------------------------------------------------------
330 * Substitution (shared implementation)
331 * ------------------------------------------------------------------------- */
332
337static regex_status_t sub_impl(const regex_t* re, regex_ctx_t* ctx, const char* subject, size_t subject_len,
338 const char* replacement, char* out_buf, size_t* out_len, uint32_t pcre2_flags) {
339 if (re == NULL || ctx == NULL || subject == NULL || replacement == NULL || out_buf == NULL || out_len == NULL) {
340 return REGEX_ERROR_ARGS;
341 }
342
343 PCRE2_SIZE result_len = (PCRE2_SIZE)*out_len;
344
345 int rc = pcre2_substitute(re->code, (PCRE2_SPTR8)subject, (PCRE2_SIZE)subject_len, 0 /* start offset */,
346 pcre2_flags | PCRE2_SUBSTITUTE_EXTENDED, ctx->match_data, NULL /* match context */,
347 (PCRE2_SPTR8)replacement, PCRE2_ZERO_TERMINATED, (PCRE2_UCHAR8*)out_buf, &result_len);
348
349 if (rc == PCRE2_ERROR_NOMATCH || rc == 0) {
350 return REGEX_NO_MATCH;
351 }
352 if (rc == PCRE2_ERROR_NOMEMORY) {
353 /* result_len now holds the required size including NUL. */
354 *out_len = (size_t)result_len;
355 return REGEX_ERROR;
356 }
357 if (rc < 0) {
358 return REGEX_ERROR;
359 }
360
361 /* result_len is the number of code units written, excluding NUL. */
362 *out_len = (size_t)result_len;
363 return REGEX_OK;
364}
365
366regex_status_t regex_sub(const regex_t* re, regex_ctx_t* ctx, const char* subject, size_t subject_len,
367 const char* replacement, char* out_buf, size_t* out_len) {
368 return sub_impl(re, ctx, subject, subject_len, replacement, out_buf, out_len, 0 /* replace first match only */);
369}
370
371regex_status_t regex_gsub(const regex_t* re, regex_ctx_t* ctx, const char* subject, size_t subject_len,
372 const char* replacement, char* out_buf, size_t* out_len) {
373 return sub_impl(re, ctx, subject, subject_len, replacement, out_buf, out_len, PCRE2_SUBSTITUTE_GLOBAL);
374}
375
376/* ---------------------------------------------------------------------------
377 * Introspection
378 * ------------------------------------------------------------------------- */
379
380uint32_t regex_group_count(const regex_t* re) {
381 if (re == NULL) {
382 return 0;
383 }
384 return re->group_count;
385}
386
387const char* regex_pattern(const regex_t* re) {
388 if (re == NULL) {
389 return "";
390 }
391 return re->pattern;
392}
393
394void regex_strerror(regex_status_t status, char* buf, size_t buf_len) {
395 if (buf == NULL || buf_len == 0) {
396 return;
397 }
398 const char* msg;
399 switch (status) {
400 case REGEX_OK:
401 msg = "success";
402 break;
403 case REGEX_NO_MATCH:
404 msg = "no match";
405 break;
406 case REGEX_ERROR:
407 msg = "general error";
408 break;
410 msg = "memory allocation failed";
411 break;
412 case REGEX_ERROR_ARGS:
413 msg = "invalid arguments";
414 break;
416 msg = "too many capture groups";
417 break;
418 default:
419 msg = "unknown status code";
420 break;
421 }
422 snprintf(buf, buf_len, "%s", msg);
423}
void regex_iter_free(regex_iter_t *iter)
Definition regex.c:321
regex_t * regex_retain(regex_t *re)
Definition regex.c:159
struct regex_ctx_s regex_ctx_t
Definition regex.h:92
struct regex_iter_s regex_iter_t
Definition regex.h:124
#define REGEX_MAX_GROUPS
Definition regex.h:40
uint32_t regex_flags_t
Definition regex.h:61
uint32_t regex_group_count(const regex_t *re)
Definition regex.c:380
const char * regex_pattern(const regex_t *re)
Definition regex.c:387
regex_status_t
Definition regex.h:47
@ REGEX_ERROR_NOMEM
Definition regex.h:51
@ REGEX_OK
Definition regex.h:48
@ REGEX_ERROR
Definition regex.h:50
@ REGEX_ERROR_LIMIT
Definition regex.h:53
@ REGEX_ERROR_ARGS
Definition regex.h:52
@ REGEX_NO_MATCH
Definition regex.h:49
regex_status_t regex_ctx_create(regex_ctx_t **out)
Definition regex.c:183
regex_status_t regex_sub(const regex_t *re, regex_ctx_t *ctx, const char *subject, size_t subject_len, const char *replacement, char *out_buf, size_t *out_len)
Definition regex.c:366
struct regex_s regex_t
Definition regex.h:83
void regex_strerror(regex_status_t status, char *buf, size_t buf_len)
Definition regex.c:394
regex_status_t regex_compile(const char *pattern, regex_flags_t flags, regex_t **out, char *errbuf, size_t errbuf_len)
Definition regex.c:93
regex_status_t regex_match(const regex_t *re, regex_ctx_t *ctx, const char *subject, regex_match_t *match)
Definition regex.c:243
regex_status_t regex_iter_next(regex_iter_t *iter, regex_match_t *match)
Definition regex.c:287
regex_status_t regex_gsub(const regex_t *re, regex_ctx_t *ctx, const char *subject, size_t subject_len, const char *replacement, char *out_buf, size_t *out_len)
Definition regex.c:371
regex_status_t regex_exec(const regex_t *re, regex_ctx_t *ctx, const char *subject, size_t len, size_t offset, regex_match_t *match)
Definition regex.c:220
void regex_ctx_free(regex_ctx_t *ctx)
Definition regex.c:208
bool regex_is_match(const regex_t *re, regex_ctx_t *ctx, const char *subject, size_t len)
Definition regex.c:250
void regex_free(regex_t *re)
Definition regex.c:166
regex_status_t regex_iter_init(regex_t *re, regex_ctx_t *ctx, const char *subject, size_t len, regex_iter_t **out)
Definition regex.c:264
regex_span_t group[REGEX_MAX_GROUPS]
Definition regex.h:114
uint32_t count
Definition regex.h:115
size_t start
Definition regex.h:102
size_t end
Definition regex.h:103