[cig-commits] r6889 - in cs/cigma/trunk: . include include/util tests util

Wed May 16 10:08:23 PDT 2007

Author: luis
Date: 2007-05-16 10:08:23 -0700 (Wed, 16 May 2007)
New Revision: 6889

Added:
   cs/cigma/trunk/include/util/
   cs/cigma/trunk/include/util/split.h
   cs/cigma/trunk/tests/test_split.c
   cs/cigma/trunk/util/
   cs/cigma/trunk/util/split.c
Log:
Added util directory for any misc utility functions that don't belong in cigma per se.
First function added is a string split() function, which should behave much like the Python version.



Added: cs/cigma/trunk/include/util/split.h
===================================================================

--- cs/cigma/trunk/include/util/split.h	2007-05-16 17:04:13 UTC (rev 6888)
+++ cs/cigma/trunk/include/util/split.h	2007-05-16 17:08:23 UTC (rev 6889)
@@ -0,0 +1,16 @@
+/* Guard name avoids double underscores, which are reserved identifiers. */
+#ifndef UTIL_SPLIT_H
+#define UTIL_SPLIT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Split the first len characters of str on the separator character sep.
+ *
+ * str is copied before it is tokenized, so the caller's string is left
+ * untouched. Leading and trailing separators are ignored, and a run of
+ * consecutive separators counts as a single one, so no empty strings
+ * appear between tokens. An input without any token still produces a
+ * single empty string.
+ *
+ * On success, *split_list points to a newly allocated array of
+ * *split_count newly allocated, NUL-terminated strings. The caller owns
+ * the array and every string in it; release them with split_free().
+ */
+void split(char *str, int len,
+           char ***split_list, int *split_count,
+           char sep);
+
+/*
+ * Free the first split_count strings stored in split_list and then the
+ * list itself. Passing a NULL list is a no-op.
+ */
+void split_free(char **split_list, int split_count);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* UTIL_SPLIT_H */

Added: cs/cigma/trunk/tests/test_split.c
===================================================================
--- cs/cigma/trunk/tests/test_split.c	2007-05-16 17:04:13 UTC (rev 6888)
+++ cs/cigma/trunk/tests/test_split.c	2007-05-16 17:08:23 UTC (rev 6889)
@@ -0,0 +1,53 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <util/split.h>
+
+/*
+ * Command-line driver for split(): splits argv[1] on the first character
+ * of argv[2] and prints every part that was found.
+ *
+ * Returns EXIT_SUCCESS after printing the parts. Returns EXIT_FAILURE
+ * when the arguments are unusable (wrong count or an empty separator
+ * argument) or when the working copy of the input cannot be allocated.
+ */
+int main(int argc, char *argv[])
+{
+    /* loop variable */
+    int i;
+
+    /* list of strings for storing the split array */
+    char **list = NULL;
+    int n = 0;
+
+    /* string to split */
+    char *str;
+    int len;
+
+    /* separator */
+    char sep;
+
+    /*
+     * An empty separator argument would hand '\0' to split(); a C string
+     * cannot be split on its own terminator, so treat it as a usage error.
+     */
+    if (argc != 3 || argv[2][0] == '\0')
+    {
+        fprintf(stderr, "Usage: %s string sep\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    str = strdup(argv[1]);
+    if (str == NULL)
+    {
+        fprintf(stderr, "%s: out of memory\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+    len = (int)strlen(str);
+    sep = argv[2][0];
+
+    split(str, len, &list, &n, sep);
+
+    printf("Found %d parts\n", n);
+    for (i = 0; i < n; i++)
+    {
+        printf("\tpart[%d] = \"%s\"\n", i, list[i]);
+    }
+
+    /* clean up: split_free() releases every part and then the list */
+    free(str);
+    split_free(list, n);
+
+    return EXIT_SUCCESS;
+}

Added: cs/cigma/trunk/util/split.c
===================================================================
--- cs/cigma/trunk/util/split.c	2007-05-16 17:04:13 UTC (rev 6888)
+++ cs/cigma/trunk/util/split.c	2007-05-16 17:08:23 UTC (rev 6889)
@@ -0,0 +1,177 @@
+#include <stdlib.h>
+#include <string.h>
+#include <util/split.h>
+
+
+/*
+ * Data structure for our basic string tokenizer. The strategy is to copy
+ * the input string into our own buffer, where the separator characters
+ * can be overwritten with '\0' in place; each token is then simply a
+ * pointer into that buffer. Using a struct lets us avoid the global
+ * variables used by the version of split.c from the project
+ * http://www.nongnu.org/uri/, which inspired this code.
+ *
+ */
+
+typedef struct
+{
+    char **tokens;          // start of each token; these point into buffer
+    int token_count;        // number of tokens found by the last split
+    int max_token_count;    // allocated capacity of tokens array, in entries
+
+    char *buffer;           // working copy of the string being split
+    int buffer_length;      // allocated size of buffer, in chars
+
+} strtok_t;
+
+
+
+/*
+ * In this section, we define the methods for our string tokenizer
+ * object. Namely, a constructor, a destructor, and then the actual
+ * routine to split the string.
+ * 
+ */
+
+/*
+ * Constructor: gives the tokenizer a 512-char working buffer and room
+ * for 16 token pointers, with no tokens recorded yet.
+ *
+ * The results of the two allocations are stored as-is; they are not
+ * checked for NULL here.
+ */
+static void strtok_init(strtok_t *st)
+{
+    const int initial_chars  = 512;
+    const int initial_tokens = 16;
+
+    /* working buffer for the copy of the string */
+    st->buffer_length = initial_chars;
+    st->buffer = malloc((size_t)initial_chars * sizeof *st->buffer);
+
+    /* array of pointers to the start of each token */
+    st->max_token_count = initial_tokens;
+    st->tokens = malloc((size_t)initial_tokens * sizeof *st->tokens);
+
+    /* nothing has been tokenized so far */
+    st->token_count = 0;
+}
+
+/*
+ * Destructor: releases the token array and the working buffer owned by
+ * the tokenizer. The struct itself is not freed, and a NULL tokenizer is
+ * ignored.
+ */
+static void strtok_free(strtok_t *st)
+{
+    if (st == NULL)
+    {
+        return;
+    }
+
+    free(st->tokens);
+    free(st->buffer);
+}
+
+/*
+ * Tokenize the first len characters of s on separator.
+ *
+ * The characters are copied into st->buffer, every separator in the copy
+ * is overwritten with '\0', and st->tokens is filled with pointers to the
+ * start of each non-empty token. Leading, trailing and repeated
+ * separators therefore never produce empty tokens. The one exception is
+ * an input that contains no token at all, which yields a single empty
+ * token.
+ *
+ * st:        tokenizer set up by strtok_init(); its buffer and tokens
+ *            array are (re)allocated here when missing or too small
+ * s:         string to tokenize; only read, never modified
+ * len:       number of characters of s to consider
+ * separator: character that delimits tokens; '\0' cannot occur inside a
+ *            C string, so it makes the whole input a single token
+ *
+ * The result is left in st->tokens / st->token_count and stays valid
+ * until the next call or strtok_free(). token_count is 0 only when s is
+ * NULL, len is negative, or memory could not be allocated.
+ */
+static void strtok_split(strtok_t *st, char *s, int len, char separator)
+{
+    enum { MIN_BUFFER_LENGTH = 512, TOKEN_BATCH = 16 };
+
+    char *first, *p;
+    int index, last;
+
+    /* report "no tokens" on every early exit below */
+    st->token_count = 0;
+
+    if (s == NULL || len < 0)
+    {
+        return;
+    }
+
+    /*
+     * First, make sure the buffer can hold len characters plus the
+     * terminating '\0'. The result of realloc goes through a temporary
+     * so the old buffer is not lost if the call fails.
+     */
+    if (st->buffer == NULL || st->buffer_length <= len)
+    {
+        int new_length = (len < MIN_BUFFER_LENGTH) ? MIN_BUFFER_LENGTH : len + 1;
+        char *new_buffer = realloc(st->buffer, (size_t)new_length);
+
+        if (new_buffer == NULL)
+        {
+            return;
+        }
+        st->buffer = new_buffer;
+        st->buffer_length = new_length;
+    }
+
+    /* the tokens array needs room for at least the first token */
+    if (st->tokens == NULL || st->max_token_count < 1)
+    {
+        char **new_tokens = realloc(st->tokens,
+                                    TOKEN_BATCH * sizeof *new_tokens);
+
+        if (new_tokens == NULL)
+        {
+            return;
+        }
+        st->tokens = new_tokens;
+        st->max_token_count = TOKEN_BATCH;
+    }
+
+    /*
+     * Next, copy the string s into our buffer and tokenize it in-place.
+     * Essentially, zero out the locations where we find the separator
+     * character, while remembering the beginning of each string.
+     */
+    memcpy(st->buffer, s, (size_t)len);
+    st->buffer[len] = '\0';
+
+    first = st->buffer;
+    index = 0;
+
+    /* a '\0' separator can never be found inside the string itself */
+    if (separator == '\0')
+    {
+        st->tokens[index++] = first;
+        st->token_count = index;
+        return;
+    }
+
+    /* remove trailing separators; only the len copied chars are valid */
+    for (last = len - 1;
+         last >= 0 && st->buffer[last] == separator;
+         last--)
+    {
+        st->buffer[last] = '\0';
+    }
+
+    /* remove leading separators */
+    while (*first == separator)
+    {
+        first++;
+    }
+
+    /* store first token */
+    st->tokens[index++] = first;
+
+    /* keep tokenizing the buffer */
+    for (p = strchr(first, separator);
+         p != NULL;
+         p = strchr(p, separator))
+    {
+        /* separator found -- zero it out and step to the next char */
+        *p++ = '\0';
+
+        /* a token starts here unless this is another separator or the end */
+        if ((*p != separator) && (*p != '\0'))
+        {
+            /* make room for the next batch of tokens before storing */
+            if (index >= st->max_token_count)
+            {
+                int new_max = st->max_token_count + TOKEN_BATCH;
+                char **new_tokens = realloc(st->tokens,
+                                            (size_t)new_max
+                                            * sizeof *new_tokens);
+
+                if (new_tokens == NULL)
+                {
+                    /* token_count is still 0: signals the failure */
+                    return;
+                }
+                st->tokens = new_tokens;
+                st->max_token_count = new_max;
+            }
+
+            st->tokens[index++] = p;
+        }
+    }
+
+    /* store the final count */
+    st->token_count = index;
+}
+
+
+
+/*
+ * Finally, we provide a procedural interface to our string tokenizer.
+ *
+ * Splits the first len characters of str on sep and hands back an
+ * independent copy of every token.
+ *
+ * str:         string to split; it is only read
+ * len:         number of characters of str to consider
+ * split_list:  receives a newly allocated array of newly allocated,
+ *              NUL-terminated strings
+ * split_count: receives the number of entries in *split_list
+ * sep:         separator character
+ *
+ * The caller assumes the responsibility of freeing the newly allocated
+ * list, as well as each individual string in that list; split_free()
+ * does both.
+ *
+ * If str is NULL, len is negative, or memory runs out, nothing is leaked
+ * and the outputs are set to *split_list = NULL and *split_count = 0.
+ * If either output pointer is NULL the call does nothing.
+ */
+void split(char *str, int len,
+           char ***split_list, int *split_count,
+           char sep)
+{
+    int i;
+    char **list;
+    strtok_t tok;
+
+    if (split_list == NULL || split_count == NULL)
+    {
+        return;
+    }
+
+    /* outputs are well defined on every return path */
+    *split_list  = NULL;
+    *split_count = 0;
+
+    if (str == NULL || len < 0)
+    {
+        return;
+    }
+
+    strtok_init(&tok);
+
+    /* the tokenizer could not get its working memory */
+    if (tok.buffer == NULL || tok.tokens == NULL)
+    {
+        strtok_free(&tok);
+        return;
+    }
+
+    strtok_split(&tok, str, len, sep);
+
+    if (tok.token_count > 0)
+    {
+        list = malloc((size_t)tok.token_count * sizeof *list);
+
+        if (list != NULL)
+        {
+            /*
+             * The tokens point into tok.buffer, which is about to be
+             * freed, so every one of them is copied. malloc + memcpy is
+             * used because strdup is not part of ISO C before C23.
+             */
+            for (i = 0; i < tok.token_count; i++)
+            {
+                size_t size = strlen(tok.tokens[i]) + 1;
+
+                list[i] = malloc(size);
+                if (list[i] == NULL)
+                {
+                    break;
+                }
+                memcpy(list[i], tok.tokens[i], size);
+            }
+
+            if (i == tok.token_count)
+            {
+                *split_list  = list;
+                *split_count = tok.token_count;
+            }
+            else
+            {
+                /* release the i copies made before the failure */
+                split_free(list, i);
+            }
+        }
+    }
+
+    strtok_free(&tok);
+}
+
+/*
+ * Release a list of strings: frees the first split_count entries of
+ * split_list and then the list itself. A NULL list is ignored.
+ */
+void split_free(char **split_list, int split_count)
+{
+    int remaining = split_count;
+
+    if (split_list == NULL)
+    {
+        return;
+    }
+
+    /* walk the entries from the back; the order of the frees is irrelevant */
+    while (remaining > 0)
+    {
+        remaining--;
+        free(split_list[remaining]);
+    }
+
+    free(split_list);
+}