Нет. Всё очень просто. Конечный автомат (структура данных, лежащая в основе регулярного выражения) не имеет памяти, кроме состояния, в котором он находится, и при произвольно глубокой вложенности вам понадобился бы произвольно большой автомат, что противоречит самому понятию конечного автомата.
Вы можете сопоставлять вложенные/парные элементы до фиксированной глубины, причём эта глубина ограничена только объёмом памяти, потому что автомат становится очень большим. На практике, однако, следует использовать автомат с магазинной памятью (push-down automaton), т. е. синтаксический анализатор для контекстно-свободной грамматики, например LL (сверху вниз) или LR (снизу вверх). При этом нужно учитывать худшее время выполнения: O(n^3) по сравнению с O(n), где n = длина(ввод).
Существует множество генераторов синтаксического анализатора, например ANTLR для Java. Найти существующую грамматику для Java (или C) также не сложно. Для получения дополнительной информации: Теория автоматов в Википедии
Надеюсь, это поможет вам начать.
Смотрите его в прямом эфире на http://ideone.com/l23He (используя stdin)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Return the num-th (1-based) field of a ';'-separated record.
 *
 * line  Writable, NUL-terminated record; a '\n' also ends the record.
 *       The separator (or newline) that follows the requested field is
 *       overwritten with '\0', so the caller should pass a scratch copy.
 * num   1-based field index.
 *
 * Returns a pointer into line, or NULL when line is NULL, num < 1, or the
 * record has fewer than num fields.
 *
 * Unlike a strtok()-based scan, empty fields are counted: in "A1;;C3" the
 * second field is "" and the third is "C3".
 */
const char* getfield(char* line, int num)
{
    char* field = line;

    if (line == NULL || num < 1)
        return NULL;

    for (;;) {
        /* Length of the current field: up to the next separator, newline or NUL. */
        size_t len = strcspn(field, ";\n");
        char stop = field[len];

        if (--num == 0) {
            field[len] = '\0';
            return field;
        }
        /* Anything but ';' means the record ended before field num. */
        if (stop != ';')
            return NULL;
        field += len + 1;
    }
}
/*
 * Demo driver: prints the third ';'-separated field of every line of the
 * file named "input" in the current directory.
 *
 * Returns 0 on success, EXIT_FAILURE when the file cannot be opened or a
 * line cannot be duplicated.
 */
int main()
{
    FILE* stream = fopen("input", "r");
    if (stream == NULL) {
        perror("input");
        return EXIT_FAILURE;
    }

    char line[1024];
    while (fgets(line, sizeof line, stream))
    {
        /* getfield() writes into its argument, so work on a copy. */
        char* tmp = strdup(line);
        if (tmp == NULL) {
            perror("strdup");
            fclose(stream);
            return EXIT_FAILURE;
        }
        const char* field = getfield(tmp, 3);
        /* Passing NULL to %s is undefined behaviour; print a placeholder instead. */
        printf("Field 3 would be %s\n", field ? field : "(none)");
        free(tmp);
    }
    fclose(stream);
    return 0;
}
Выход:
Field 3 would be nazwisko
Field 3 would be Kowalski
Field 3 would be Nowak
Полный пример, который оставляет поля в виде строк с завершающим нулём в исходном буфере ввода и даёт доступ к ним через массив указателей на символы. Проверено, что этот CSV-обработчик работает с полями, заключёнными в «двойные кавычки», и игнорирует любые символы-разделители внутри них.
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// adjust BUFFER_SIZE to suit longest line
// (parenthesized so the macro stays one value inside any expression,
// e.g. x / BUFFER_SIZE or BUFFER_SIZE % n)
#define BUFFER_SIZE (1024 * 1024)
// number of fields expected on every data line
#define NUM_FIELDS 10
// loadFile() stops once more than this many lines have failed to parse
#define MAXERRS 5
// status codes returned by the functions in this file
#define RET_OK 0
#define RET_FAIL 1
// boolean values
#define FALSE 0
#define TRUE 1
// char* array will point to fields
// loadValues() stores one pointer per field here; each pointer refers to a
// NUL-terminated substring inside the line buffer that was passed to it.
char *pFields[NUM_FIELDS];
// field offsets into pFields array:
// (NAZWISKo and MIEJSCOw really are spelled with a lower-case last letter)
#define LP 0
#define IMIE 1
#define NAZWISKo 2
#define ULICA 3
#define NUMER 4
#define KOD 5
#define MIEJSCOw 6
#define TELEFON 7
#define EMAIL 8
#define DATA_UR 9
// Reads every line of pFile and hands data lines to loadValues().
long loadFile(FILE *pFile, long *errcount);
// Splits one line in place and fills pFields.
static int loadValues(char *line, long lineno);
// Field delimiter; main() sets it from the first character of argv[2].
static char delim;
/*
 * Read a delimited text file line by line, split every data line into
 * pFields via loadValues() and print a summary of each good line.
 *
 * pFile     Open stream to read from.
 * errcount  Incremented once per line that loadValues() rejects; reading
 *           stops as soon as it exceeds MAXERRS. May be NULL, in which case
 *           errors are counted internally only.
 *
 * Returns the number of lines read (header and empty lines included).
 * NOTE: returns RET_FAIL (1) when pFile is NULL, which a caller cannot tell
 * apart from "one line read"; kept for backward compatibility.
 *
 * A physical line longer than the buffer is delivered by fgets() in several
 * pieces and is therefore counted and parsed as several lines.
 */
long loadFile(FILE *pFile, long *errcount){
    // static: 1 MiB is too large to place safely on the stack, and pFields
    // keeps pointing into this buffer after the function returns
    static char sInputBuf [BUFFER_SIZE];
    long localErrors = 0L;
    long lineno = 0L;

    if(pFile == NULL)
        return RET_FAIL;
    if(errcount == NULL)
        errcount = &localErrors;

    // fgets() returns NULL at end of file and on error, so no feof() test is needed
    while (fgets(sInputBuf, sizeof sInputBuf, pFile) != NULL) {
        // skip first line (headers)
        if(++lineno==1)
            continue;
        // jump over empty lines; fgets() keeps the line terminator, so a
        // line is empty when nothing precedes the first CR/LF
        if(strcspn(sInputBuf, "\r\n")==0)
            continue;
        // set pFields array pointers to null-terminated string fields in sInputBuf
        if(loadValues(sInputBuf,lineno)==RET_FAIL){
            (*errcount)++;
            if(*errcount > MAXERRS)
                break;
        } else {
            // On return pFields array pointers point to loaded fields ready for load into DB or whatever
            // Fields can be accessed via pFields, e.g.
            printf("lp=%s, imie=%s, data_ur=%s\n", pFields[LP], pFields[IMIE], pFields[DATA_UR]);
        }
    }
    return lineno;
}
static int loadValues(char *line, long lineno){
if(line == NULL)
return RET_FAIL;
// chop of last char of input if it is a CR or LF (e.g.Windows file loading in Unix env.)
// can be removed if sure fgets has removed both CR and LF from end of line
if(*(line + strlen(line)-1) == '\r' || *(line + strlen(line)-1) == '\n')
*(line + strlen(line)-1) = '\0';
if(*(line + strlen(line)-1) == '\r' || *(line + strlen(line)-1 )== '\n')
*(line + strlen(line)-1) = '\0';
char *cptr = line;
int fld = 0;
int inquote = FALSE;
char ch;
pFields[fld]=cptr;
while((ch=*cptr) != '\0' && fld < NUM_FIELDS){
if(ch == '"') {
if(! inquote)
pFields[fld]=cptr+1;
else {
*cptr = '\0'; // zero out " and jump over it
}
inquote = ! inquote;
} else if(ch == delim && ! inquote){
*cptr = '\0'; // end of field, null terminate it
pFields[++fld]=cptr+1;
}
cptr++;
}
if(fld > NUM_FIELDS-1){
fprintf(stderr, "Expected field count (%d) exceeded on line %ld\n", NUM_FIELDS, lineno);
return RET_FAIL;
} else if (fld < NUM_FIELDS-1){
fprintf(stderr, "Expected field count (%d) not reached on line %ld\n", NUM_FIELDS, lineno);
return RET_FAIL;
}
return RET_OK;
}
/*
 * Entry point: <program> csvfilepath delimiter
 *
 * Loads the file with loadFile() using the first character of argv[2] as
 * the field delimiter, then prints the line and error totals.
 *
 * Returns RET_OK when every line parsed, RET_FAIL on a usage error, when
 * the file cannot be opened, or when at least one line failed to parse.
 */
int main(int argc, char **argv)
{
    FILE *fp;
    long errcount = 0L;
    long lines = 0L;

    if(argc!=3){
        // Derive the program name with strrchr(); basename() would need
        // <libgen.h>, which this file does not include.
        const char *prog = (argc > 0 && argv[0] != NULL) ? argv[0] : "prog";
        const char *slash = strrchr(prog, '/');
        printf("Usage: %s csvfilepath delimiter\n", slash != NULL ? slash + 1 : prog);
        return (RET_FAIL);
    }
    if((delim=argv[2][0])=='\0'){
        fprintf(stderr,"delimiter must be specified\n");
        return (RET_FAIL);
    }
    fp = fopen(argv[1] , "r");
    if(fp == NULL) {
        fprintf(stderr,"Error opening file: %d\n",errno);
        return(RET_FAIL);
    }
    lines=loadFile(fp,&errcount);
    fclose(fp);
    printf("Processed %ld lines, encountered %ld error(s)\n", lines, errcount);
    if(errcount>0)
        return(RET_FAIL);
    return(RET_OK);
}
Решил поделиться этим кодом. Он довольно простой, но эффективный. Он обрабатывает файлы с разделителями-запятыми и значениями в кавычках. Вы можете легко изменить его в соответствии с вашими потребностями.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Echo a comma-separated file to stdout one value at a time, skipping the
 * first argv[2] lines.
 *
 * Quote characters are kept as part of the value and count towards its
 * length. A comma between a pair of double quotes is data, not a separator.
 * A newline always ends the line, even inside quotes.
 *
 * Returns 0 on success; EXIT_FAILURE on bad arguments, allocation or open
 * failure, a read error, or a value longer than argv[3] characters.
 */
int main(int argc, char *argv[])
{
    //argv[1] path to csv file
    //argv[2] number of lines to skip
    //argv[3] length of longest value (in characters)
    FILE *pfinput;
    unsigned long nSkipLines, lenLongestValue;
    unsigned long currentLine = 1;
    char *pTempValHolder;
    char *endp;
    size_t capacity;
    size_t vcpm = 0;        //value character marker
    int QuotationOnOff = 0; //0 - off, 1 - on
    int lineOpen = 0;       //1 while the current line has pending output
    int status = EXIT_SUCCESS;
    int c;

    if (argc < 4) {
        fprintf(stderr, "Usage: %s csvfile skiplines maxvaluelength\n",
                argc > 0 ? argv[0] : "csvread");
        return EXIT_FAILURE;
    }

    nSkipLines = strtoul(argv[2], &endp, 10);
    if (endp == argv[2] || *endp != '\0') {
        fprintf(stderr, "invalid number of lines to skip: %s\n", argv[2]);
        return EXIT_FAILURE;
    }
    lenLongestValue = strtoul(argv[3], &endp, 10);
    if (endp == argv[3] || *endp != '\0') {
        fprintf(stderr, "invalid longest value length: %s\n", argv[3]);
        return EXIT_FAILURE;
    }

    // one extra byte for the terminating '\0'; reject sizes that do not fit
    capacity = (size_t)lenLongestValue;
    if ((unsigned long)capacity != lenLongestValue || capacity + 1 == 0) {
        fprintf(stderr, "longest value length is too large: %s\n", argv[3]);
        return EXIT_FAILURE;
    }
    pTempValHolder = malloc(capacity + 1);
    if (pTempValHolder == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }

    pfinput = fopen(argv[1], "r");
    if (pfinput == NULL) {
        perror(argv[1]);
        free(pTempValHolder);
        return EXIT_FAILURE;
    }

    while ((c = fgetc(pfinput)) != EOF)
    {
        //lines up to and including nSkipLines are read but not echoed
        int active = currentLine > nSkipLines;
        int store = 0;

        switch (c)
        {
        case ',':
            if (!active)
                break;
            if (QuotationOnOff) {
                store = 1;  //comma inside quotes belongs to the value
                break;
            }
            pTempValHolder[vcpm] = '\0';
            printf("%s,", pTempValHolder);
            vcpm = 0;
            lineOpen = 1;
            break;
        case '\n':
            if (active) {
                pTempValHolder[vcpm] = '\0';
                printf("%s\n", pTempValHolder);
                vcpm = 0;
            }
            lineOpen = 0;
            currentLine++;
            break;
        case '\"':
            if (active) {
                QuotationOnOff = !QuotationOnOff;
                store = 1;
            }
            break;
        default:
            store = active;
            break;
        }

        if (store) {
            if (vcpm >= capacity) {
                fprintf(stderr, "value longer than %lu characters on line %lu\n",
                        lenLongestValue, currentLine);
                status = EXIT_FAILURE;
                break;
            }
            pTempValHolder[vcpm++] = (char)c;
            lineOpen = 1;
        }
    }

    if (status == EXIT_SUCCESS && ferror(pfinput)) {
        perror(argv[1]);
        status = EXIT_FAILURE;
    }
    //flush a final line that has no terminating newline
    if (status == EXIT_SUCCESS && lineOpen) {
        pTempValHolder[vcpm] = '\0';
        printf("%s\n", pTempValHolder);
    }

    fclose(pfinput);
    free(pTempValHolder);
    return status;
}
// C++ fragment: read a file line by line and split every line on ';'.
// NOTE(review): `filenema` is defined outside this fragment and looks like a
// typo for `filename` - confirm against the declaring code.
ifstream fs(filenema);
string line = "";
// outer loop: one iteration per line read from the file stream
while (getline(fs, line))
{
stringstream linestream(line);
string token = "";
// inner loop: one iteration per ';'-separated token of the current line
while (getline(linestream, token, ';'))
{
...
}
}
/* csv - read write comma separated value format
* Copyright (c) 2003 Michael B. Allen <mba2000 ioplex.com>
*
* The MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>
#include <errno.h>
#include <wchar.h>
#include <wctype.h>
#include "mba/msgno.h"
#include "mba/csv.h"
#define ST_START 1
#define ST_COLLECT 2
#define ST_TAILSPACE 3
#define ST_END_QUOTE 4
/* Byte input cursor read by snextch(): either a stdio stream or a memory range. */
struct sinput {
/* when non-NULL, characters are read from this stream with fgetc() */
FILE *in;
/* next byte to read when `in` is NULL */
const unsigned char *src;
/* bytes still available at `src` */
size_t sn;
/* number of characters handed out so far */
size_t count;
};
/* Wide-character input cursor read by wnextch(); memory source only. */
struct winput {
/* next wide character to read */
const wchar_t *src;
/* wide characters still available at `src` */
size_t sn;
/* number of characters handed out so far */
size_t count;
};
/*
 * Fetch the next character from a byte input cursor.
 *
 * Reads from the stdio stream when in->in is set, otherwise from the memory
 * range in->src / in->sn. Returns the character and bumps in->count, returns
 * 0 when the input is exhausted, or -1 after reporting errno through PMNO()
 * when the stream is in an error state.
 */
static int
snextch(struct sinput *in)
{
    int ch;

    if (in->in == NULL) {
        /* memory source */
        if (in->sn == 0) {
            return 0;
        }
        ch = *(in->src)++;
        in->sn--;
    } else {
        /* stream source */
        ch = fgetc(in->in);
        if (ch == EOF) {
            if (!ferror(in->in)) {
                return 0;
            }
            PMNO(errno);
            return -1;
        }
    }

    in->count++;
    return ch;
}
/*
 * Fetch the next wide character from a memory cursor.
 *
 * Returns the character (converted to int) and bumps in->count, or 0 when
 * no characters remain.
 */
static int
wnextch(struct winput *in)
{
    int ch = 0;

    if (in->sn != 0) {
        ch = *(in->src)++;
        in->sn--;
        in->count++;
    }
    return ch;
}
/*
 * Parse one CSV row from a byte input cursor.
 *
 * in     input cursor, read one character at a time through snextch()
 * buf    output storage; element text is copied here, each element
 *        followed by a '\0'
 * bn     capacity of buf; decremented for every byte stored
 * row    receives one pointer into buf per element; cleared to NULL first
 * rn     maximum number of elements to parse
 * sep    separator character
 * flags  CSV_TRIM and CSV_QUOTES are the two bits tested here
 *
 * Returns in->count (converted to int) on success. Returns -1 with errno
 * set to EILSEQ for a misplaced or unterminated quote, to E2BIG when bn
 * reaches 0, or after snextch() reported an error.
 *
 * The loop stops when snextch() returns a value <= 0, so a zero byte in
 * the input is treated the same as end of input.
 */
static int
csv_parse_str(struct sinput *in,
unsigned char *buf,
size_t bn,
unsigned char *row[],
int rn,
int sep,
int flags)
{
/* j: write index in the current element; t: its length after trimming;
 * r: index of the next row[] slot */
int trim, quotes, ch, state, r, j, t, inquotes;
trim = flags & CSV_TRIM;
quotes = flags & CSV_QUOTES;
state = ST_START;
inquotes = 0;
ch = r = j = t = 0;
memset(row, 0, sizeof(unsigned char *) * rn);
while (rn && bn && (ch = snextch(in)) > 0) {
switch (state) {
case ST_START:
/* leading whitespace: kept only when not trimming */
if (ch != '\n' && ch != sep && isspace(ch)) {
if (!trim) {
buf[j++] = ch; bn--;
t = j;
}
break;
} else if (quotes && ch == '"') {
/* opening quote: discard anything collected so far */
j = t = 0;
state = ST_COLLECT;
inquotes = 1;
break;
}
state = ST_COLLECT;
/* fallthrough: the current character is the first one of the element */
case ST_COLLECT:
if (inquotes) {
if (ch == '"') {
state = ST_END_QUOTE;
break;
}
} else if (ch == sep || ch == '\n') {
/* end of an unquoted element: publish it and terminate at t */
row[r++] = buf; rn--;
if (ch == '\n' && t && buf[t - 1] == '\r') {
t--; bn++; /* crlf -> lf */
}
buf[t] = '\0'; bn--;
buf += t + 1;
j = t = 0;
state = ST_START;
inquotes = 0;
if (ch == '\n') {
/* newline ends the row: rn = 0 stops the loop */
rn = 0;
}
break;
} else if (quotes && ch == '"') {
PMNF(errno = EILSEQ, ": unexpected quote in element %d", (r + 1));
return -1;
}
buf[j++] = ch; bn--;
/* t follows j unless trimming and ch is whitespace */
if (!trim || isspace(ch) == 0) {
t = j;
}
break;
case ST_TAILSPACE:
case ST_END_QUOTE:
if (ch == sep || ch == '\n') {
/* end of a quoted element: terminate at j */
row[r++] = buf; rn--;
buf[j] = '\0'; bn--;
buf += j + 1;
j = t = 0;
state = ST_START;
inquotes = 0;
if (ch == '\n') {
rn = 0;
}
break;
} else if (quotes && ch == '"' && state != ST_TAILSPACE) {
buf[j++] = '"'; bn--; /* nope, just an escaped quote */
t = j;
state = ST_COLLECT;
break;
} else if (isspace(ch)) {
state = ST_TAILSPACE;
break;
}
errno = EILSEQ;
PMNF(errno, ": bad end quote in element %d", (r + 1));
return -1;
}
}
/* snextch() returned -1: read error */
if (ch == -1) {
AMSG("");
return -1;
}
/* output buffer exhausted */
if (bn == 0) {
PMNO(errno = E2BIG);
return -1;
}
/* input ended before a newline: publish the element still in progress */
if (rn) {
if (inquotes && state != ST_END_QUOTE) {
/* input ended inside an unterminated quoted element */
PMNO(errno = EILSEQ);
return -1;
}
row[r] = buf;
buf[t] = '\0';
}
return in->count;
}
/*
 * Parse one CSV row from a wide-character input cursor.
 *
 * Wide-character counterpart of csv_parse_str() with the same state machine:
 * in is read through wnextch(), element text is copied to buf (capacity bn,
 * in wide characters), row[] receives up to rn pointers into buf, sep is the
 * separator, and flags is tested for CSV_TRIM and CSV_QUOTES.
 *
 * Returns in->count (converted to int) on success, or -1 with errno set to
 * EILSEQ (misplaced or unterminated quote) or E2BIG (bn reached 0).
 *
 * Unlike csv_parse_str(), no '\r' is removed before a terminating '\n'.
 */
static int
csv_parse_wcs(struct winput *in, wchar_t *buf, size_t bn, wchar_t *row[], int rn, wint_t sep, int flags)
{
/* j: write index in the current element; t: its length after trimming;
 * r: index of the next row[] slot */
int trim, quotes, state, r, j, t, inquotes;
wint_t ch;
trim = flags & CSV_TRIM;
quotes = flags & CSV_QUOTES;
state = ST_START;
inquotes = 0;
ch = r = j = t = 0;
memset(row, 0, sizeof(wchar_t *) * rn);
while (rn && bn && (ch = wnextch(in)) > 0) {
switch (state) {
case ST_START:
/* leading whitespace: kept only when not trimming */
if (ch != L'\n' && ch != sep && iswspace(ch)) {
if (!trim) {
buf[j++] = ch; bn--;
t = j;
}
break;
} else if (quotes && ch == L'"') {
/* opening quote: discard anything collected so far */
j = t = 0;
state = ST_COLLECT;
inquotes = 1;
break;
}
state = ST_COLLECT;
/* fallthrough: the current character is the first one of the element */
case ST_COLLECT:
if (inquotes) {
if (ch == L'"') {
state = ST_END_QUOTE;
break;
}
} else if (ch == sep || ch == L'\n') {
/* end of an unquoted element: publish it and terminate at t */
row[r++] = buf; rn--;
buf[t] = L'\0'; bn--;
buf += t + 1;
j = t = 0;
state = ST_START;
inquotes = 0;
if (ch == L'\n') {
/* newline ends the row: rn = 0 stops the loop */
rn = 0;
}
break;
} else if (quotes && ch == L'"') {
PMNF(errno = EILSEQ, ": unexpected quote in element %d", (r + 1));
return -1;
}
buf[j++] = ch; bn--;
/* t follows j unless trimming and ch is whitespace */
if (!trim || iswspace(ch) == 0) {
t = j;
}
break;
case ST_TAILSPACE:
case ST_END_QUOTE:
if (ch == sep || ch == L'\n') {
/* end of a quoted element: terminate at j */
row[r++] = buf; rn--;
buf[j] = L'\0'; bn--;
buf += j + 1;
j = t = 0;
state = ST_START;
inquotes = 0;
if (ch == L'\n') {
rn = 0;
}
break;
} else if (quotes && ch == L'"' && state != ST_TAILSPACE) {
buf[j++] = L'"'; bn--; /* nope, just an escaped quote */
t = j;
state = ST_COLLECT;
break;
} else if (iswspace(ch)) {
state = ST_TAILSPACE;
break;
}
PMNF(errno = EILSEQ, ": bad end quote in element %d", (r + 1));
return -1;
}
}
/* NOTE(review): wnextch() as written has no error return of its own, so this
 * only triggers if a source character converts to -1 - confirm intent */
if (ch == (wint_t)-1) {
AMSG("");
return -1;
}
/* output buffer exhausted */
if (bn == 0) {
PMNO(errno = E2BIG);
return -1;
}
/* input ended before a newline: publish the element still in progress */
if (rn) {
if (inquotes && state != ST_END_QUOTE) {
/* input ended inside an unterminated quoted element */
PMNO(errno = EILSEQ);
return -1;
}
row[r] = buf;
buf[t] = L'\0';
}
return in->count;
}
int
csv_row_parse_wcs(const wchar_t *src, size_t sn, wchar_t *buf, size_t bn, wchar_t *row[], int rn, int sep, int trim)
{
struct winput input;
input.src = src;
input.sn = sn;
input.count = 0;
return csv_parse_wcs(&input, buf, bn, row, rn, (wint_t)sep, trim);
}
int
csv_row_parse_str(const unsigned char *src, size_t sn, unsigned char *buf, size_t bn, unsigned char *row[], int rn, int sep, int trim)
{
struct sinput input;
input.in = NULL;
input.src = src;
input.sn = sn;
input.count = 0;
return csv_parse_str(&input, buf, bn, row, rn, sep, trim);
}
int
csv_row_fread(FILE *in, unsigned char *buf, size_t bn, unsigned char *row[], int numcols, int sep, int trim)
{
struct sinput input;
input.in = in;
input.count = 0;
return csv_parse_str(&input, buf, bn, row, numcols, sep, trim);
}
"mba/msgno.h"
и "mba/csv.h"
.
– Toby Speight
15 March 2018 в 15:22
Следующий код написан на чистом C и обрабатывает пробелы. Он выделяет память только один раз, поэтому для каждой обработанной строки нужен один вызов free().
/* Tiny CSV Reader */
/* Copyright (C) 2015, Deligiannidis Konstantinos
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://w...content-available-to-author-only...u.org/licenses/>. */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/*
 * Split line on every occurrence of the (possibly multi-character) string
 * delim. There is no limit on the number of tokens.
 *
 * line         NUL-terminated input; it is not modified.
 * delim        Non-empty delimiter string.
 * out_storage  Must point to a NULL pointer. On success it receives one
 *              malloc()ed region holding a NULL-terminated array of token
 *              pointers followed by the token text, so a single free()
 *              releases everything.
 *
 * Consecutive delimiters produce empty tokens.
 *
 * Returns the number of tokens (>= 1) on success, or:
 *   -1  line, delim or out_storage is NULL
 *   -2  delim is the empty string
 *   -3  memory allocation failed
 *   -4  *out_storage is not NULL (looks already allocated)
 * On failure *out_storage is left untouched.
 */
int getcols( const char * const line, const char * const delim, char ***out_storage )
{
    if ( out_storage == NULL ) return -1;  //Nowhere to store the result
    if ( *out_storage != NULL ) return -4; //This SHOULD be NULL: Not Already Allocated
    if ( !line || !delim ) return -1;      //NULL pointers Rejected Here

    const size_t delim_size = strlen( delim );
    if ( delim_size == 0 ) return -2;      //Delimiter not provided

    //Pass 1: count the tokens and measure the line, so that one allocation
    //of exactly the right size can hold both the pointers and the text.
    size_t tokens_found = 1;
    const char *scan = line;
    const char *hit;
    while ( ( hit = strstr( scan, delim ) ) != NULL ) {
        tokens_found++;
        scan = hit + delim_size;
    }
    const size_t line_size = (size_t)( scan - line ) + strlen( scan );

    //Layout: [ ptr1, ..., ptrN, NULL, "token1\0", ..., "tokenN\0" ]
    const size_t size_ptr_region = ( tokens_found + 1 ) * sizeof( char * );
    char **out = malloc( size_ptr_region + line_size + 1 );
    if ( out == NULL ) return -3;

    char *str_region = (char *) out + size_ptr_region;
    memcpy( str_region, line, line_size + 1 ); //Copy the text with its '\0'

    //Pass 2: cut the copy at every delimiter. Only the first character of a
    //delimiter becomes '\0'; the search resumes after the whole delimiter,
    //so the same matches are found as in pass 1.
    size_t i = 0;
    char *token = str_region;
    char *cut;
    while ( ( cut = strstr( token, delim ) ) != NULL ) {
        out[ i++ ] = token;
        *cut = '\0';
        token = cut + delim_size;
    }
    out[ i++ ] = token;   //Last token runs to the end of the line
    out[ i ] = NULL;      //Terminator of the pointer array

    *out_storage = out;   //Publish only after everything succeeded
    return (int) tokens_found;
}
/*
 * Demo driver for getcols(): splits a single line on ";;", then splits a
 * small multi-line text into rows on "\n" and every row into columns on ";".
 *
 * Returns 0 on success, EXIT_FAILURE when getcols() reports an error.
 */
int main()
{
    char in_line[] = "Arg1;;Th;s is not Del;m;ter;;Arg3;;;;Final";
    char delim[] = ";;";
    char **columns = NULL; //Must be NULL before getcols(), otherwise it returns -4
    char **rows = NULL;    //getcols() requires it to be NULL
    int i, j;

    printf("Example1:\n");
    int cols_found = getcols( in_line, delim, &columns );
    if ( cols_found < 0 ) {
        fprintf( stderr, "getcols failed with code %d\n", cols_found );
        return EXIT_FAILURE;
    }
    for ( i = 0; i < cols_found; i++ ) printf("Column[ %d ] = %s\n", i, columns[ i ] );
    free( columns ); //Release the Single Contiguous Memory Space.
    columns = NULL;

    printf("\n\nExample2, Nested:\n\n");
    char example_file[] = "ID;Day;Month;Year;Telephone;email;Date of registration\n"
                          "1;Sunday;january;2009;123-124-456;jitter@go.xyz;2015-05-13\n"
                          "2;Monday;March;2011;(+30)333-22-55;buffer@wl.it;2009-05-23";

    int rows_found = getcols( example_file, "\n", &rows );
    if ( rows_found < 0 ) {
        fprintf( stderr, "getcols failed with code %d\n", rows_found );
        return EXIT_FAILURE;
    }
    //The pointer array is NULL-terminated, so iterate until the sentinel.
    for ( i = 0; rows[ i ]; i++ ) {
        printf("Line[ %d ] = %s\n", i, rows[ i ] );
        char **columnX = NULL;
        int rc = getcols( rows[ i ], ";", &columnX );
        if ( rc < 0 ) {
            fprintf( stderr, "getcols failed with code %d\n", rc );
            free( rows );
            return EXIT_FAILURE;
        }
        for ( j = 0; columnX[ j ]; j++ ) printf(" Col[ %d ] = %s\n", j, columnX[ j ] );
        free( columnX );
    }
    free( rows );
    rows = NULL;
    return 0;
}
strtok не умеет обрабатывать пустые узлы. Как бы вы подошли к входной строке вроде "A1,B2,C3,,F5,G6"? Я использую комбинацию strchr и strcpy, но у меня не получается извлечь значение «G6». – ProfessionalAmateur 17 September 2014 в 21:09
«Tokens» — правильно; слишком много времени провожу с XML. – ProfessionalAmateur 17 September 2014 в 22:29