/*
    massip-parse

    This module parses IPv4 and IPv6 addresses.

    It's not a typical parser. It's optimized around parsing large
    files containing millions of addresses and ranges using a 
    "state-machine parser".
*/
#include "massip.h"
#include "massip-parse.h"
#include "massip-rangesv4.h"
#include "massip-rangesv6.h"
#include "util-logger.h"
#include "util-bool.h"
#include "util-malloc.h"
#include "util-safefunc.h"
#include "unusedparm.h"

#include <string.h>

struct massip_parser
{
    unsigned long long line_number;
    unsigned long long char_number;
    unsigned state;
    unsigned tmp;
    unsigned char digit_count;
    unsigned addr;
    unsigned begin;
    unsigned end;
    struct {
        ipv6address _begin;
        ipv6address _end;
        unsigned short tmp[8];
        unsigned char index;
        unsigned char ellision_index;
        unsigned is_bracket:1;
        unsigned is_second:1;
    } ipv6;
};

/***************************************************************************
 ***************************************************************************/
static struct massip_parser *
_parser_init(struct massip_parser *p)
{
    memset(p, 0, sizeof(*p));
    p->line_number = 1;
    p->ipv6.ellision_index = 8;
    return p;
}

/***************************************************************************
 ***************************************************************************/
static void
_parser_destroy(struct massip_parser *p)
{
    UNUSEDPARM(p);
}

/***************************************************************************
 ***************************************************************************/
static void
_parser_err(struct massip_parser *p, unsigned long long *line_number, unsigned long long *charindex)
{
    *line_number = p->line_number;
    *charindex = p->char_number;
}

/** 
 * Called before parsing the first address in a pair, and also
 * after the first address, to prepare for parsing the next
 * address
 */
static void
_init_next_address(struct massip_parser *p, int is_second)
{
    p->tmp = 0;
    p->ipv6.ellision_index = 8;
    p->ipv6.index = 0;
    p->ipv6.is_bracket = 0;
    p->digit_count = 0;
    p->ipv6.is_second = is_second;
}



static unsigned
_parser_finish_ipv6(struct massip_parser *p)
{
    unsigned index = p->ipv6.index;
    unsigned ellision = p->ipv6.ellision_index;
    

    /* We must have seen 8 numbers, or an ellision */
    if (index < 8 && ellision >= 8)
        return 1;
    
    /* Handle ellision */
    memmove(
        &p->ipv6.tmp[8-(index-ellision)],
        &p->ipv6.tmp[ellision],
        sizeof(p->ipv6.tmp[0]) * (index-ellision)
        );
    memset(
        &p->ipv6.tmp[ellision],
        0,
        sizeof(p->ipv6.tmp[0]) * (8 - index)
    );
    
    /* Copy over to begin/end. We parse the address as a series of 16-bit
     * integers, but return the result as two 64-bit integers */
    {
        ipv6address a;
        a.hi = (uint64_t)p->ipv6.tmp[0] << 48ULL
                | (uint64_t)p->ipv6.tmp[1] << 32ULL
                | (uint64_t)p->ipv6.tmp[2] << 16ULL
                | (uint64_t)p->ipv6.tmp[3] << 0ULL;
        a.lo = (uint64_t)p->ipv6.tmp[4] << 48ULL
                | (uint64_t)p->ipv6.tmp[5] << 32ULL
                | (uint64_t)p->ipv6.tmp[6] << 16ULL
                | (uint64_t)p->ipv6.tmp[7] << 0ULL;
        if (p->ipv6.is_second)
            p->ipv6._end = a;
        else {
            p->ipv6._begin = a;

            /* Set this here in case there is no 'end' address */
            p->ipv6._end = a;
        }
    }

    /* Reset the parser to start parsing the next address */
    _init_next_address(p, 1);

    return 0;
}

/***************************************************************************
 * We store the IPv6 addresses that we are building inside the 'state'
 * of the state-machine. This function copies them out of the opaque
 * state into discrete values.
 ***************************************************************************/
static void
_parser_get_ipv6(struct massip_parser *state, ipv6address *begin, ipv6address *end)
{
    *begin = state->ipv6._begin;
    *end = state->ipv6._end;
}

enum parser_state_t {
    LINE_START, ADDR_START,
    COMMENT,
    NUMBER0, NUMBER1, NUMBER2, NUMBER3, NUMBER_ERR,
    SECOND0, SECOND1, SECOND2, SECOND3, SECOND_ERR,
    IPV4_CIDR_NUM,
    UNIDASH1, UNIDASH2,
    IPV6_BEGIN, IPV6_COLON, IPV6_CIDR, IPV6_CIDR_NUM,
    IPV6_NEXT,
    IPV6_END,
    ERROR
};

/***************************************************************************
 * When we start parsing an address, we don't know whether it's going to 
 * be IPv4 or IPv6. We assume IPv4, but when we hit a condition indicating
 * that it's IPv6 instead, we need change the temporary number we 
 * are working on from decimal to hex, then move from the middle of 
 * parsing an IPv4 address to the middle of parsing an IPv6 address.
 ***************************************************************************/
static int
_switch_to_ipv6(struct massip_parser *p, int old_state)
{
    unsigned num = p->tmp;

    num = ((num/1000)%10) * 16 * 16 * 16
        + ((num/100)%10) * 16 * 16
        + ((num/10)%10) * 16
        + (num % 10);
    
    //printf("%u -> 0x%x\n", p->tmp, num);
    p->tmp = num;
    return old_state;
}


enum {
    IPV4_n, IPV4_nn, IPV4_nnn, IPV4_nnn_, 
    IPV4_nnn_n, IPV4_nnn_nn, IPV4_nnn_nnn, IPV4_nnn_nnn_, 
    IPV4_nnn_nnn_n, IPV4_nnn_nnn_nn, IPV4_nnn_nnn_nnn, IPV4_nnn_nnn_nnn_,
    IPV4_nnn_nnn_nnn_n, IPV4_nnn_nnn_nnn_nn, IPV4_nnn_nnn_nnn_nnn, IPV4_nnn_nnn_nnn_nnn_,
    IPV4e_n, IPV4e_nn, IPV4e_nnn, IPV4e_nnn_, 
    IPV4e_nnn_n, IPV4e_nnn_nn, IPV4e_nnn_nnn, IPV4e_nnn_nnn_, 
    IPV4e_nnn_nnn_n, IPV4e_nnn_nnn_nn, IPV4e_nnn_nnn_nnn, IPV4e_nnn_nnn_nnn_,
    IPV4e_nnn_nnn_nnn_n, IPV4e_nnn_nnn_nnn_nn, IPV4e_nnn_nnn_nnn_nnn, IPV4e_nnn_nnn_nnn_nnn_,


};


/**
 * Applies a CIDR mask to an IPv4 address to create a begin/end address.
 */
static void
_ipv4_apply_cidr(unsigned *begin, unsigned *end, unsigned bitcount)
{
    unsigned long long mask = 0xFFFFFFFF00000000ULL >> bitcount;
    
    /* mask off low-order bits */
    *begin &= (unsigned)mask;

    /* Set all suffix bits to 1, so that 192.168.1.0/24 has
     * an ending address of 192.168.1.255. */
    *end = *begin | (unsigned)~mask;
}

/**
 * Given an address 'being' and a 'prefix', return the 'begin' and 'end' address of the range.
 * @param begin
 *      An in/out parameter. This may have some extra bits somewhere in the range.
 *      These will be masked off and set to zero when the function returns.
 * @param end
 *      An out parameter. This will be set to the last address of the range, meaning
 *      that all the trailing bits will be set to '1'.
 * @parame prefix
 *      The number of bits of the prefix, from [0..128]. If the value is 0,
 *      then the 'begin' address will be set to all zeroes and the 'end'
 *      address will be set to all ones. If the value is 128,
 *      the 'begin' address is unchanged and the 'end' address
 *      is set to the same as 'begin'.
 */
static void
_ipv6_apply_cidr(ipv6address *begin, ipv6address *end, unsigned prefix)
{
    ipv6address mask;
    
    /* For bad prefixes, make sure we return an invalid address */
    if (prefix > 128) {
        static const ipv6address invalid = {~0ULL, ~0ULL};
        *begin = invalid;
        *end = invalid;
        return;
    };

    /* Create the mask from the prefix */
    if (prefix > 64)
        mask.hi = ~0ULL;
    else if (prefix == 0)
        mask.hi = 0;
    else
        mask.hi = ~0ULL << (64 - prefix);
    
    if (prefix > 64)
        mask.lo = ~0ULL << (128 - prefix);
    else
        mask.lo = 0;

    /* Mask off any non-zero bits from the start
     * TODO print warning */
    begin->hi &= mask.hi;
    begin->lo &= mask.lo;
    
    /* Set all suffix bits to 1, so that 192.168.1.0/24 has
     * an ending address of 192.168.1.255. */
    end->hi = begin->hi | ~mask.hi;
    end->lo = begin->lo | ~mask.lo;
}

/***************************************************************************
 * Parse the next IPv4/IPv6 address from a text stream, using a
 * 'state-machine parser'.
 ***************************************************************************/
static enum {Still_Working, Found_Error, Found_IPv4, Found_IPv6}
_parser_next(struct massip_parser *p, const char *buf, size_t *r_offset, size_t length,
                unsigned *r_begin, unsigned *r_end)
{ 
    size_t i;
    enum parser_state_t state = p->state;
    int result = Still_Working;

    /* The 'offset' parameter is optional. If NULL, then set it to zero */
    if (r_offset)
        i = *r_offset;
    else
        i = 0;

    /* For all bytes in this chunk. This loop will exit early once
     * we've found a complete IP address. */
    while (i < length) {
        unsigned char c = buf[i++];

        p->char_number++;
        switch (state) {
            case LINE_START:
            case ADDR_START:
                _init_next_address(p, 0);
                switch (c) {
                    case ' ': case '\t': case '\r':
                        /* ignore leading whitespace */
                        continue;
                    case '\n':
                        p->line_number++;
                        p->char_number = 0;
                        continue;
                    case '#': case ';': case '/': case '-':
                        state = COMMENT;
                        continue;
                        
                    case '0': case '1': case '2': case '3': case '4':
                    case '5': case '6': case '7': case '8': case '9':
                        p->tmp = (c - '0');
                        p->digit_count = 1;
                        state = NUMBER0;
                        break;
                    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
                        p->tmp = (c - 'a' + 10);
                        p->digit_count = 1;
                        state = IPV6_BEGIN;
                        break;
                    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
                        p->tmp = (c - 'A' + 10);
                        p->digit_count = 1;
                        state = IPV6_BEGIN;
                        break;
                    case ':':
                        p->ipv6.tmp[p->ipv6.index++] = 0;
                        state = IPV6_COLON;
                        break;
                    case '[':
                        p->ipv6.is_bracket = 1;
                        state = IPV6_BEGIN;
                        break;
                    default:
                        state = ERROR;
                        length = i; /* break out of loop */
                        break;
                }
                break;
            case IPV6_CIDR:
                p->digit_count = 0;
                p->tmp = 0;
                switch (c) {
                    case '0': case '1': case '2': case '3': case '4':
                    case '5': case '6': case '7': case '8': case '9':
                        p->tmp = (c - '0');
                        p->digit_count = 1;
                        state = IPV6_CIDR_NUM;
                        break;
                    default:
                        state = ERROR;
                        length = i; /* break out of loop */
                        break;
                }
                break;
                
            case IPV6_COLON:
                p->digit_count = 0;
                p->tmp = 0;
                if (c == ':') {
                    if (p->ipv6.ellision_index < 8) {
                        state = ERROR;
                        length = i;
                    } else {
                        p->ipv6.ellision_index = p->ipv6.index;
                        state = IPV6_COLON;
                    }
                    break;
                }
                state = IPV6_BEGIN;

                /* drop down */
            case IPV6_BEGIN:
            case IPV6_NEXT:
                switch (c) {
                    case '0': case '1': case '2': case '3': case '4':
                    case '5': case '6': case '7': case '8': case '9':
                        if (p->digit_count >= 4) {
                            state = ERROR;
                            length = i;
                        } else {
                            p->tmp = p->tmp * 16 + (c - '0');
                            p->digit_count++;
                        }
                        break;
                    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
                        if (p->digit_count >= 4) {
                            state = ERROR;
                            length = i;
                        } else {
                            p->tmp = p->tmp * 16 + (c - 'a' + 10);
                            p->digit_count++;
                        }
                        break;
                    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
                        if (p->digit_count >= 4) {
                            state = ERROR;
                            length = i;
                        } else {
                            p->tmp = p->tmp * 16 + (c - 'A' + 10);
                            p->digit_count++;
                        }
                        break;
                    case ':':
                        if (p->ipv6.index >= 8) {
                            state = ERROR;
                            length = i;
                        } else {
                            p->ipv6.tmp[p->ipv6.index++] = (unsigned short)p->tmp;
                            state = IPV6_COLON;
                        }
                        break;
                    case ']':
                        if (!p->ipv6.is_bracket) {
                            state = ERROR;
                            length = i;
                        } else {
                            state = IPV6_END;
                        }
                        break;
                    case '[':
                        if (p->ipv6.is_bracket) {
                            state = ERROR;
                            length = i;
                        } else {
                            p->ipv6.is_bracket = 1;
                        }
                        break;
                    case '/':
                    case ' ':
                    case '\t':
                    case '\r':
                    case '\n':
                    case ',':
                    case '-':
                        i--; /* push back */
                        state = IPV6_END;
                        continue;
                    default:
                        state = ERROR;
                        length = i;
                        break;
                }
                break;

            case IPV6_END:
                /* Finish off the trailing number */
                p->ipv6.tmp[p->ipv6.index++] = (unsigned short)p->tmp;

                /* Do the final processing of this IPv6 address and
                 * prepare for the next one */
                if (_parser_finish_ipv6(p) != 0) {
                    state = ERROR;
                    length = i;
                    continue;
                }

                /* Now decide the next state, whether this is a single
                 * address, an address range, or a CIDR address */
                switch (c) {
                    case '/':
                        result = Still_Working;
                        state = IPV6_CIDR;
                        break;
                    case '-':
                        result = Still_Working;
                        state = IPV6_NEXT;
                        break;
                    case '\n':
                        p->line_number++;
                        p->char_number = 0;
                        /* drop down */
                    case ' ':
                    case '\t':
                    case '\r':
                    case ',':
                        result = Found_IPv6;
                        state = 0;
                        length = i; /* shorten the end to break out of loop */
                        break;
                    default:
                        state = ERROR;
                        length = i;
                        break;
                }
                break;
            case COMMENT:
                if (c == '\n') {
                    state = LINE_START;
                    p->line_number++;
                    p->char_number = 0;
                } else
                    state = COMMENT;
                break;
            case IPV6_CIDR_NUM:
                switch (c) {
                    case '0': case '1': case '2': case '3': case '4':
                    case '5': case '6': case '7': case '8': case '9':
                        if (p->digit_count == 4) {
                            state = ERROR;
                            length = i; /* break out of loop */
                        } else {
                            p->digit_count++;
                            p->tmp = p->tmp * 10 + (c - '0');
                            if (p->tmp > 128) {
                                state = ERROR;
                                length = i;
                            }
                            continue;
                        }
                        break;
                    case ':':
                    case ',':
                    case ' ':
                    case '\t':
                    case '\r':
                    case '\n':
                        {
                            _ipv6_apply_cidr(&p->ipv6._begin, &p->ipv6._end, p->tmp);

                            state = ADDR_START;
                            length = i; /* break out of loop */
                            if (c == '\n') {
                                p->line_number++;
                                p->char_number = 0;
                            }
                            *r_begin = p->begin;
                            *r_end = p->end;
                            result = Found_IPv6;
                        }
                        break;
                    default:
                        state = ERROR;
                        length = i; /* break out of loop */
                        break;
                }
                break;
            case IPV4_CIDR_NUM:
                switch (c) {
                    case '0': case '1': case '2': case '3': case '4':
                    case '5': case '6': case '7': case '8': case '9':
                        if (p->digit_count == 3) {
                            state = ERROR;
                            length = i; /* break out of loop */
                        } else {
                            p->digit_count++;
                            p->tmp = p->tmp * 10 + (c - '0');
                            if (p->tmp > 32) {
                                state = ERROR;
                                length = i;
                            }
                            continue;
                        }
                        break;
                    case ':':
                    case ',':
                    case ' ':
                    case '\t':
                    case '\r':
                    case '\n':
                        {
                            _ipv4_apply_cidr(&p->begin, &p->end, p->tmp);
                            state = ADDR_START;
                            length = i; /* break out of loop */
                            if (c == '\n') {
                                p->line_number++;
                                p->char_number = 0;
                            }
                            *r_begin = p->begin;
                            *r_end = p->end;
                            result = Found_IPv4;
                        }
                        break;
                    default:
                        state = ERROR;
                        length = i; /* break out of loop */
                        break;
                }
                break;

            case UNIDASH1:
                if (c == 0x80)
                    state = UNIDASH2;
                else {
                    state = ERROR;
                    length = i; /* break out of loop */
                }
                break;
            case UNIDASH2:
                /* This covers:
                 * U+2010 HYPHEN
                 * U+2011 NON-BREAKING HYPHEN
                 * U+2012 FIGURE DASH
                 * U+2013 EN DASH
                 * U+2014 EM DASH
                 * U+2015 HORIZONTAL BAR
                 */
                if (c < 0x90 || 0x95 < c) {
                    state = ERROR;
                    length = i; /* break out of loop */
                } else {
                    c = '-';
                    state = NUMBER3;
                    /* drop down */
                }


            case NUMBER0:
            case NUMBER1:
            case NUMBER2:
            case NUMBER3:
            case SECOND0:
            case SECOND1:
            case SECOND2:
            case SECOND3:
                switch (c) {
                    case '.':
                        p->addr = (p->addr << 8) | p->tmp;
                        p->tmp = 0;
                        p->digit_count = 0;
                        if (state == NUMBER3 || state == SECOND3) {
                            length = i;
                            state = ERROR;
                        } else
                            state++;
                        break;
                    case '0': case '1': case '2': case '3': case '4':
                    case '5': case '6': case '7': case '8': case '9':
                        p->digit_count++;
                        p->tmp = p->tmp * 10 + (c - '0');
                        if (p->tmp > 255 || p->digit_count > 3) {
                            if (state == NUMBER0) {
                                /* Assume that we've actually got an
                                 * IPv6 number */
                                _switch_to_ipv6(p, state);
                                state = IPV6_BEGIN;
                            } else {
                                state = ERROR;
                                length = i;
                            }
                        }
                        continue;
                        break;
                    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
                    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
                        if (state == NUMBER0 || state == SECOND0) {
                            /* Assume that we've actually got an
                             * IPv6 number */
                            _switch_to_ipv6(p, state);
                            state = IPV6_BEGIN;
                            i--; /* go back one character */
                        } else {
                            state = ERROR;
                            length = i; /* break out of loop */
                        }
                        break;
                    case 0xe2:
                        if (state == NUMBER3) {
                            state = UNIDASH1;
                        } else {
                            state = ERROR;
                            length = i; /* break out of loop */
                        }
                        break;
                    case '-':
                    case 0x96: /* long dash, comes from copy/pasting into exclude files */
                        if (state == NUMBER3) {
                            p->begin = (p->addr << 8) | p->tmp;
                            p->tmp = 0;
                            p->digit_count = 0;
                            p->addr = 0;
                            state = SECOND0;
                        } else {
                            state = NUMBER_ERR;
                            length = i;
                        }
                        break;
                    case '/':
                        if (state == NUMBER3) {
                            p->begin = (p->addr << 8) | p->tmp;
                            p->tmp = 0;
                            p->digit_count = 0;
                            p->addr = 0;
                            state = IPV4_CIDR_NUM;
                        } else {
                            state = NUMBER_ERR;
                            length = i; /* break out of loop */
                        }
                        break;
                    case ':':
                        if (state == NUMBER0) {
                            /* Assume this is an IPv6 address instead of an IPv4 address */
                            _switch_to_ipv6(p, state);
                            state = IPV6_BEGIN;
                            i--;
                            break;
                        }
                    case ',':
                    case ' ':
                    case '\t':
                    case '\r':
                    case '\n':
                        if (state == NUMBER3) {
                            p->begin = (p->addr << 8) | p->tmp;
                            p->end = p->begin;
                            p->tmp = 0;
                            p->digit_count = 0;
                            p->addr = 0;
                            state = ADDR_START;
                            length = i; /* break out of loop */
                            if (c == '\n') {
                                p->line_number++;
                                p->char_number = 0;
                            }
                            *r_begin = p->begin;
                            *r_end = p->end;
                            result = Found_IPv4;
                        } else if (state == SECOND3) {
                            p->end = (p->addr << 8) | p->tmp;
                            p->tmp = 0;
                            p->digit_count = 0;
                            p->addr = 0;
                            state = ADDR_START;
                            length = i; /* break out of loop */
                            if (c == '\n') {
                                p->line_number++;
                                p->char_number = 0;
                            }
                            *r_begin = p->begin;
                            *r_end = p->end;
                            result = Found_IPv4;
                        } else {
                            state = NUMBER_ERR;
                            length = i;
                        }
                        break;
                    default:
                        state = ERROR;
                        length = i; /* break out of loop */
                        break;
                }
                break;
                
            default:
            case ERROR:
            case NUMBER_ERR:
            case SECOND_ERR:
                state = ERROR;
                length = i; /* break */
                break;
        }
    }

    /* The 'offset' parameter is optional. If NULL, then 
     * we don't return a value */
    if (r_offset)
        *r_offset = i;

    p->state = state;
    if (state == ERROR || state == NUMBER_ERR || state == SECOND_ERR)
        result = Found_Error;
    return result;
}


/***************************************************************************
 * Test errors. We should get exactly which line-number and which character
 * in the line caused the error
 ***************************************************************************/
static int
rangefile_test_error(const char *buf, unsigned long long in_line_number, unsigned long long in_char_number, unsigned which_test)
{
    size_t length = strlen(buf);
    size_t offset = 0;
    struct massip_parser p[1];
    unsigned out_begin = 0xa3a3a3a3;
    unsigned out_end  = 0xa3a3a3a3;
    unsigned long long out_line_number;
    unsigned long long out_char_number;
    int x;

    /* test the entire buffer */
    _parser_init(p);
    x = _parser_next(p, buf, &offset, length, &out_begin, &out_end);
    if (x != Found_Error)
        goto fail;
    _parser_err(p, &out_line_number, &out_char_number);
    if (in_line_number != out_line_number || in_char_number != out_char_number)
        goto fail;

    /* test one byte at a time */
    _parser_destroy(p);
    _parser_init(p);
    offset = 0;
    out_begin = 0xa3a3a3a3;
    out_end  = 0xa3a3a3a3;
    
    x = 0;
    while (offset < length) {
        x = _parser_next(p, buf, &offset, offset+1, &out_begin, &out_end);
        if (x == Found_Error)
            break;
    }
    if (x != Found_Error)
        goto fail;
    _parser_err(p, &out_line_number, &out_char_number);

    if (in_line_number != out_line_number || in_char_number != out_char_number)
        goto fail;

    _parser_destroy(p);
    return 0;
fail:
    _parser_destroy(p);
    fprintf(stderr, "[-] rangefile test fail, line=%u\n", which_test);
    return 1;
}

/***************************************************************************
 ***************************************************************************/
int
massip_parse_file(struct MassIP *massip, const char *filename)
{
    struct RangeList *targets_ipv4 = &massip->ipv4;
    struct Range6List *targets_ipv6 = &massip->ipv6;
    struct massip_parser p[1];
    char buf[65536];
    FILE *fp = NULL;
    bool is_error = false;
    unsigned addr_count = 0;
    unsigned long long line_number, char_number;

    /* Kludge: should never happen, should fix this when reading in
     * config, not this deep in the code. */
    if (filename == 0 || filename[0] == '\0') {
        fprintf(stderr, "[-] missing filename for ranges\n");
        exit(1);
    }

    /*
     * Open the file containing IP addresses, which can potentially be
     * many megabytes in size
     */
    if (strcmp(filename, "-") == 0) {
        fp = stdin;
    } else {
        fp = fopen(filename, "rb");
        if (fp == NULL) {
            fprintf(stderr, "[-] FAIL: parsing IP addresses\n");
            fprintf(stderr, "[-] %s: %s\n", filename, strerror(errno));
            exit(1);
        }
    }

    /*
     * Create a parser for reading in the IP addresses using a state
     * machine parser
     */
    _parser_init(p);

    /*
     * Read in the data a block at a time, parsing according to the state
     * machine.
     */
    while (!is_error) {
        size_t count;
        size_t offset;

        count = fread(buf, 1, sizeof(buf), fp);
        if (count <= 0)
            break;

        offset = 0;
        while (offset < count) {
            unsigned begin, end;
            int err;

            err = _parser_next(p, buf, &offset, count, &begin, &end);
            switch (err) {
            case Still_Working:
                if (offset < count) {
                    /* We reached this somehow in the middle of the buffer, but
                     * this return is only possible at the end of the buffer */
                    fprintf(stderr, "[-] rangeparse_next(): unknown coding failure\n");
                }
                break;
            case Found_Error:
            default:
                _parser_err(p, &line_number, &char_number);
                fprintf(stderr, "[-] %s:%llu:%llu: invalid IP address on line #%llu\n", filename, line_number, char_number, line_number);
                is_error = true;
                count = offset;
                break;
            case Found_IPv4:
                rangelist_add_range(targets_ipv4, begin, end);
                addr_count++;
                break;
            case Found_IPv6:
                {
                    ipv6address found_begin, found_end;
                    _parser_get_ipv6(p, &found_begin, &found_end);
                    range6list_add_range(targets_ipv6, found_begin, found_end);
                    addr_count++;
                }
                break;
            }
        }
    }
    
    /* Close the file, unless we are reading from <stdin> */
    if (fp != stdin && fp != NULL)
        fclose(fp);

    /* In case the file doesn't end with a newline '\n', then artificially
     * add one to the end. This is just a repeat of the code above */
    if (!is_error) {
        size_t offset = 0;
        unsigned begin, end;
        int err;
        err = _parser_next(p, "\n", &offset, 1, &begin, &end);
        switch (err) {
        case Still_Working:
                break;
        case Found_Error:
        default:
            _parser_err(p, &line_number, &char_number);
            fprintf(stderr, "[-] %s:%llu:%llu: invalid IP address on line #%llu\n", filename, line_number, char_number, line_number);
            is_error = true;
            break;
        case Found_IPv4:
            rangelist_add_range(targets_ipv4, begin, end);
            addr_count++;
            break;
        case Found_IPv6:
            {
                ipv6address found_begin, found_end;
                _parser_get_ipv6(p, &found_begin, &found_end);
                range6list_add_range(targets_ipv6, found_begin, found_end);
                addr_count++;
            }
            break;
        }
    }

    LOG(1, "[+] %s: %u addresses read\n", filename, addr_count);

    /* Target list must be sorted every time it's been changed, 
     * before it can be used */
    rangelist_sort(targets_ipv4);

    if (is_error)
        return -1;  /* fail */
    else
        return 0; /* success*/
}


ipv6address
massip_parse_ipv6(const char *line)
{
    struct massip_parser p[1];
    size_t count = strlen(line);
    size_t offset = 0;
    int err;
    unsigned begin, end;
    ipv6address result;
    ipv6address range;

    _parser_init(p);
    err = _parser_next(p, line, &offset, count, &begin, &end);
again:
    switch (err) {
        case Still_Working:
            if (offset < count) {
                /* We reached this somehow in the middle of the buffer, but
                 * this return is only possible at the end of the buffer */
                fprintf(stderr, "[-] _parser_next(): unknown coding failure\n");
                goto fail;
            } else {
                err = _parser_next(p, "\n", 0, 1, &begin, &end);
                if (err == Still_Working) {
                    fprintf(stderr, "[-] _parser_next(): unknown coding failure\n");
                    goto fail;
                } else {
                    goto again;
                }
            }
            break;
        case Found_Error:
        default:
            goto fail;
        case Found_IPv4:
            goto fail;
        case Found_IPv6:
            _parser_get_ipv6(p, &result, &range);
            if (!ipv6address_is_equal(result, range))
                goto fail;
            return result;
    }
fail:
    result.hi = ~0ULL;
    result.lo = ~0ULL;
    return result;
}

unsigned
massip_parse_ipv4(const char *line)
{
    struct massip_parser p[1];
    size_t count = strlen(line);
    size_t offset = 0;
    int err;
    unsigned begin, end;


    _parser_init(p);
    err = _parser_next(p, line, &offset, count, &begin, &end);
again:
    switch (err) {
        case Still_Working:
            if (offset < count) {
                /* We reached this somehow in the middle of the buffer, but
                 * this return is only possible at the end of the buffer */
                fprintf(stderr, "[-] _parser_next(): unknown coding failure\n");
                goto fail;
            } else {
                err = _parser_next(p, "\n", 0, 1, &begin, &end);
                if (err == Still_Working) {
                    fprintf(stderr, "[-] _parser_next(): unknown coding failure\n");
                    goto fail;
                } else {
                    goto again;
                }
            }
            break;
        case Found_Error:
        default:
            goto fail;
        case Found_IPv6:
            goto fail;
        case Found_IPv4:
            if (begin != end)
                goto fail;
            return begin;
    }
fail:
    return 0xFFFFFFFF;
}

enum RangeParseResult
massip_parse_range(const char *line, size_t *offset, size_t count, struct Range *ipv4, struct Range6 *ipv6)
{
    struct massip_parser p[1];
    int err;
    unsigned begin, end;
    size_t tmp_offset = 0;
    
    /* The 'count' (length of the string) is an optional parameter. If
     * zero, and also the offset is NULL, then set it to the string length */
    if (count == 0 && offset == NULL)
        count = strlen(line);

    /* The offset is an optional parameter. If NULL, then we set
     * it to point to a value on the stack instead */
    if (offset == NULL)
        offset = &tmp_offset;
    
    /* Create e parser object */
    _parser_init(p);

    /* Parse the next range from the input */
    err = _parser_next(p, line, offset, count, &begin, &end);
again:
    switch (err) {
        case Still_Working:
            if (*offset < count) {
                /* We reached this somehow in the middle of the buffer, but
                 * this return is only possible at the end of the buffer */
                fprintf(stderr, "[-] _parser_next(): unknown coding failure\n");
                return Bad_Address;
            } else {
                err = _parser_next(p, "\n", 0, 1, &begin, &end);
                if (err == Still_Working) {
                    fprintf(stderr, "[-] _parser_next(): unknown coding failure\n");
                    return Bad_Address;
                } else {
                    goto again;
                }
            }
            break;
        case Found_Error:
        default:
            return Bad_Address;
        case Found_IPv4:
            ipv4->begin = begin;
            ipv4->end = end;
            return Ipv4_Address;
        case Found_IPv6:
            _parser_get_ipv6(p, &ipv6->begin, &ipv6->end);
            return Ipv6_Address;
    }
}

/**
 * This tests  parsing when addresses/ranges are specified on the command-line
 * or configuration files, rather than the other test-cases which test parsing
 * when the IP addresses are specified in a file. The thing we are looking for
 * here is specifically when users separate addresses with things like
 * commas and spaces.
 */
static int
selftest_massip_parse_range(void)
{
    struct testcases {
        const char *line;
        union {
            struct Range ipv4;
            struct Range6 ipv6;
        } list[4];
    } cases[] = {
        {"0.0.1.0/24,0.0.3.0-0.0.4.0", {{{0x100,0x1ff}}, {{0x300,0x400}}}},
        {"0.0.1.0-0.0.1.255,0.0.3.0-0.0.4.0", {{{0x100,0x1ff}}, {{0x300,0x400}}}},
        {"0.0.1.0/24 0.0.3.0-0.0.4.0", {{{0x100,0x1ff}}, {{0x300,0x400}}}},
        {0}
    };
    size_t i;
    
    for (i=0; cases[i].line; i++) {
        size_t length = strlen(cases[i].line);
        size_t offset = 0;
        size_t j = 0;
        struct Range6 range6;
        struct Range range4;
        
        while (offset < length) {
            int x;
            x = massip_parse_range(cases[i].line, &offset, length, &range4, &range6);
            switch (x) {
                default:
                case Bad_Address:
                    fprintf(stdout, "[-] selftest_massip_parse_range[%u] fail\n", (unsigned)i);
                    return 1;
                case Ipv4_Address:
                    if (cases[i].list[j].ipv4.begin != range4.begin
                        || cases[i].list[j].ipv4.end != range4.end) {
                        fprintf(stdout, "[-] %u.%u.%u.%u - %u.%u.%u.%u\n",
                                (unsigned char)(range4.begin>>24),
                                (unsigned char)(range4.begin>>16),
                                (unsigned char)(range4.begin>> 8),
                                (unsigned char)(range4.begin>> 0),
                                (unsigned char)(range4.end>>24),
                                (unsigned char)(range4.end>>16),
                                (unsigned char)(range4.end>> 8),
                                (unsigned char)(range4.end>> 0)
                                );
                        fprintf(stdout, "[-] selftest_massip_parse_range[%u] fail\n", (unsigned)i);
                        return 1;
                    }
                    break;
            }
            j++;
        }
        
        /* Make sure we have found all the expected cases */
        if (cases[i].list[j].ipv4.begin != 0) {
            fprintf(stdout, "[-] selftest_massip_parse_range[%u] fail\n", (unsigned)i);
            return 1;
        }
    }
    return 0;
}


/***************************************************************************
 ***************************************************************************/
static int
rangefile6_test_buffer(struct massip_parser *parser,
                       const char *buf,
                       ipv6address expected_begin,
                       ipv6address expected_end)
{
    size_t length = strlen(buf);
    size_t offset = 0;
    ipv6address found_begin = {1,2};
    ipv6address found_end = {1,2};
    unsigned tmp1, tmp2;
    int err;
    
    /* test the entire buffer */
    err = _parser_next(parser, buf, &offset, length, &tmp1, &tmp2);
    if (err == Still_Working)
        err = _parser_next(parser, "\n", 0, 1, &tmp1, &tmp2);
    switch (err) {
    case Found_IPv6:
        /* Extract the resulting IPv6 address from the state structure */
        _parser_get_ipv6(parser, &found_begin, &found_end);
    
        /* Test to see if the parsed address equals the expected address */
        if (!ipv6address_is_equal(found_begin, expected_begin)) {
            ipaddress_formatted_t fmt1 = ipv6address_fmt(found_begin);
            ipaddress_formatted_t fmt2 = ipv6address_fmt(expected_begin);
            fprintf(stderr, "[-] begin mismatch: found=[%s], expected=[%s]\n", fmt1.string, fmt2.string);
            goto fail;
        }
        if (!ipv6address_is_equal(found_end, expected_end)) {
            ipaddress_formatted_t fmt1 = ipv6address_fmt(found_end);
            ipaddress_formatted_t fmt2 = ipv6address_fmt(expected_end);
            fprintf(stderr, "[-] end mismatch: found=[%s], expected=[%s]\n", fmt1.string, fmt2.string);
            goto fail;
        }
        break;
    case Found_IPv4:
        if (expected_begin.hi != 0 || expected_end.hi != 0)
            goto fail;
        if (tmp1 != expected_begin.lo || tmp2 != expected_end.lo)
            goto fail;
        break;
    case Still_Working:
        /* Found a partial address, which is a normal result in the 
         * real world at buffer boundaries, but which is an error
         * here */
        goto fail;
    case Found_Error:
    default:
        goto fail;
    }

    return 0; /* success */
fail:
    return 1; /* failure */
}

/***************************************************************************
 * List of test cases. Each test case contains three parts:
 * - the string representation of an address, as read from a file, meaning
 *   that it can contain additional things like comment strings
 * - the first address of a range, which in the case of IPv6 addresses
 *   will be two 64-bit numbers, but an IPv4 address have a high-order
 *   number set to zero and the low-order number set to the IPv4 address
 * - the second address of a range, which in the case of individual
 *   addresses, will be equal to the first number
 ***************************************************************************/
struct {
    const char *string;
    ipv6address begin;
    ipv6address end;
} test_cases[] = {
    {"[1::1]/126", {0x0001000000000000ULL, 0ULL}, {0x0001000000000000ULL, 3ULL}},
    {"1::1/126", {0x0001000000000000ULL, 0ULL}, {0x0001000000000000ULL, 3ULL}},
    {"[1::1]-[2::3]", {0x0001000000000000ULL, 1ULL}, {0x0002000000000000ULL, 3ULL}},
    {"1::1-2::3", {0x0001000000000000ULL, 1ULL}, {0x0002000000000000ULL, 3ULL}},
    {"[1234:5678:9abc:def0:0fed:cba9:8765:4321]", {0x123456789abcdef0ULL, 0x0fedcba987654321ULL}, {0x123456789abcdef0ULL, 0x0fedcba987654321ULL}},
    {"22ab::1", {0x22ab000000000000ULL, 1ULL}, {0x22ab000000000000ULL, 1ULL}},
    {"240e:33c:2:c080:d08:d0e:b53:e74e", {0x240e033c0002c080ULL, 0x0d080d0e0b53e74eULL}, {0x240e033c0002c080ULL, 0x0d080d0e0b53e74eULL}},
    {"2a03:90c0:105::9", {0x2a0390c001050000ULL, 9ULL}, {0x2a0390c001050000ULL, 9ULL}},
    {"2a03:9060:0:400::2", {0x2a03906000000400ULL, 2ULL}, {0x2a03906000000400ULL, 2ULL}},
    {"2c0f:ff00:0:a:face:b00c:0:a7", {0x2c0fff000000000aULL, 0xfaceb00c000000a7ULL}, {0x2c0fff000000000aULL, 0xfaceb00c000000a7ULL}},
    {"2a01:5b40:0:4a01:0:e21d:789f:59b1", {0x2a015b4000004a01ULL, 0x0000e21d789f59b1ULL}, {0x2a015b4000004a01ULL, 0x0000e21d789f59b1ULL}},
    {"2001:1200:10::1", {0x2001120000100000ULL, 1ULL}, {0x2001120000100000ULL, 1ULL}},
    {"fec0:0:0:ffff::1", {0xfec000000000ffffULL, 1ULL}, {0xfec000000000ffffULL, 1ULL}},
    {"1234:5678:9abc:def0:0fed:cba9:8765:4321", {0x123456789abcdef0ULL, 0x0fedcba987654321ULL}, {0x123456789abcdef0ULL, 0x0fedcba987654321ULL}},
    {"[1111:2222:3333:4444:5555:6666:7777:8888]", {0x1111222233334444ULL, 0x5555666677778888ULL}, {0x1111222233334444ULL, 0x5555666677778888ULL}},
    {"1::1", {0x0001000000000000ULL, 1ULL}, {0x0001000000000000ULL, 1ULL}},
    {"1.2.3.4", {0, 0x01020304}, {0, 0x01020304}},
    {"#test\n  97.86.162.161" "\x96" "97.86.162.175\n", {0, 0x6156a2a1}, {0, 0x6156a2af}},
    {"1.2.3.4/24\n", {0, 0x01020300}, {0, 0x010203ff}},
    {" 1.2.3.4-1.2.3.5\n", {0, 0x01020304}, {0, 0x01020305}},
    {0,{0,0},{0,0}}
};

/***************************************************************************
 * Called during "make test" to run a regression test over this module.
 ***************************************************************************/
int
massip_parse_selftest(void)
{
    int x = 0;
    size_t i;
    struct massip_parser parser[1];

    
    /* Run through the test cases, stopping at the first failure */
    _parser_init(parser);
    for (i=0; test_cases[i].string; i++) {
        x += rangefile6_test_buffer(parser,
                                    test_cases[i].string, 
                                    test_cases[i].begin,
                                    test_cases[i].end);
        if (x) {
            fprintf(stderr, "[-] failed: %u: %s\n", (unsigned)i, test_cases[i].string);
            break;
        }
    }
    _parser_destroy(parser);

    
    /* First, do the single line test */
    x += selftest_massip_parse_range();
    if (x)
        return x;
    

    x += rangefile_test_error("#bad ipv4\n 257.1.1.1\n", 2, 5, __LINE__);
    x += rangefile_test_error("#bad ipv4\n 1.257.1.1.1\n", 2, 6, __LINE__);
    x += rangefile_test_error("#bad ipv4\n 1.10.257.1.1.1\n", 2, 9, __LINE__);
    x += rangefile_test_error("#bad ipv4\n 1.10.255.256.1.1.1\n", 2, 13, __LINE__);
    x += rangefile_test_error("#bad ipv4\n 1.1.1.1.1\n", 2, 9, __LINE__);

    if (x)
       LOG(0, "[-] rangefile_selftest: fail\n");
    return x;
}

