From: Thadeu Lima de Souza Cascardo Date: Wed, 28 Sep 2005 05:05:20 +0000 (+0000) Subject: Compute first set of a grammar X-Git-Tag: cascardo@tlscascardo--private,libgrammatic--lr1--0.1--base-0~10 X-Git-Url: http://git.cascardo.info/?p=cascardo%2Fgrammar.git;a=commitdiff_plain;h=760e66d5887cc5879a2a71d769536ef6fe1e3662 Compute first set of a grammar Compute the first set for every nonterminal of the grammar. git-archimport-id: cascardo@tlscascardo--private/libgrammatic--dev--0.1--patch-8 --- diff --git a/first.c b/first.c new file mode 100644 index 0000000..febcc2e --- /dev/null +++ b/first.c @@ -0,0 +1,497 @@ +#include +#include +#include + +typedef struct +{ + gboolean has_empty; + GHashTable* terminals; + GList* rules; + GList* recursive; +} first_set_t; + +first_set_t* first_set_new () +{ + first_set_t* first_set; + first_set = g_malloc (sizeof (first_set_t)); + first_set->has_empty = FALSE; + first_set->terminals = g_hash_table_new_full (symbol_hash, symbol_equal, + g_free, NULL); + first_set->rules = NULL; + first_set->recursive = NULL; + return first_set; +} + +void first_set_delete (first_set_t* first_set) +{ + GList* l; + g_hash_table_destroy (first_set->terminals); + l = first_set->rules; + while (l != NULL) + { + rule_delete (l->data); + l = g_list_next (l); + } + g_list_free (first_set->rules); + l = first_set->recursive; + while (l != NULL) + { + rule_delete (l->data); + l = g_list_next (l); + } + g_list_free (first_set->recursive); + g_free (first_set); +} + +void first_set_insert_terminal (first_set_t* first_set, symbol_t* symbol) +{ + g_hash_table_insert (first_set->terminals, symbol, NULL); +} + +void first_set_insert (gpointer key, gpointer val, gpointer data) +{ + g_hash_table_insert (data, symbol_copy (key), NULL); +} + +GHashTable* first_set_union (GHashTable* a, GHashTable* b) +{ + g_hash_table_foreach (b, first_set_insert, a); + return a; +} + +void first_advance_recursive (first_set_t* first_set, symbol_t* symbol) +{ + + GList* rules; + rules = first_set->recursive; + + /* + * For each recursive rule, remove recursiveness and insert any new + * rule to list of non-recursive rules. + */ + + while (rules != NULL) + { + + rule_t* rule; + GList* symbols; + symbol_t* first_symbol; + + rule = rules->data; + symbols = grammar_get_rule (rule); + + assert (symbols != NULL); + + first_symbol = symbols->data; + + assert (first_symbol->value == symbol->value); + + /* Remove any leading symbols equal to the recursive nonterminal */ + while (first_symbol->value == symbol->value) + { + first_symbol = rule_pop (rule); + + /* If no symbol remains after removing leading recursive + * nonterminal, it may produce the empty string */ + if (first_symbol == NULL) + { + first_set->has_empty = TRUE; + rule_delete (rule); + break; + } + /* If after removing leading recursive nonterminal, there + * is a terminal, it is in the first set of the nonterminal */ + else if (first_symbol->terminal) + { + first_set_insert_terminal (first_set, + symbol_copy (first_symbol)); + rule_delete (rule); + break; + } + /* + * If after removing leading recursive nonterminal, there + * is a nonterminal, the first of the remaining sequence + * of symbols is in the first set of the recursive + * nonterminal + */ + else if (first_symbol->value != symbol->value) + { + first_set->rules = g_list_append (first_set->rules, rule); + break; + } + } + + rules = g_list_next (rules); + + } + + g_list_free (first_set->recursive); + first_set->recursive = NULL; + +} + +void first_prepare (gpointer key, gpointer val, gpointer data) +{ + + symbol_t* symbol; + GList* rules; + GHashTable* first; + + first_set_t* first_set; + + symbol = (symbol_t*) key; + rules = *(GList**) val; + first = (GHashTable*) data; + + assert (symbol->terminal == FALSE); + + /* For each nonterminal in the grammar, there is a first set */ + first_set = first_set_new (); + g_hash_table_insert (first, symbol_copy (symbol), first_set); + + /* + * For each rule for the nonterminal in the grammar, it will either + * produce the empty string directly, start with a terminal that will + * be included in the first set, start with a different nonterminal or + * be a recursive rule. + */ + while (rules != NULL) + { + + rule_t* rule; + GList* symbols; + symbol_t* first_symbol; + + rule = rules->data; + symbols = grammar_get_rule (rule); + + if (symbols == NULL) + { + first_set->has_empty = TRUE; + } + else + { + first_symbol = symbols->data; + if (first_symbol->terminal) + { + first_set_insert_terminal (first_set, + symbol_copy (first_symbol)); + } + else if (first_symbol->value == symbol->value) + { + first_set->recursive = g_list_prepend (first_set->recursive, + rule_copy (rule)); + } + else + { + first_set->rules = g_list_prepend (first_set->rules, + rule_copy (rule)); + } + } + + rules = g_list_next (rules); + + } + + /* + * If there any no non-recursive rules starting with a nonterminal + * recursiveness may be easily eliminated. + */ + if (first_set->rules == NULL && first_set->recursive != NULL) + { + /* + * If the nonterminal may produce the empty string, each recursive + * rule will lead to a new rule with the leading recursive nonterminal + * removed. + */ + if (first_set->has_empty) + { + first_advance_recursive (first_set, symbol); + } + /* If it does not produce the empty string, every recursive rule + * may be simply removed. All terminals in first set are known at + * this point. + */ + else + { + GList* l; + l = first_set->recursive; + while (l != NULL) + { + rule_delete (l->data); + l = g_list_next (l); + } + g_list_free (first_set->recursive); + first_set->recursive = NULL; + } + assert (first_set->recursive == NULL); + } + +} + +void first_iterate (gpointer key, gpointer val, gpointer data) +{ + + symbol_t* symbol; + first_set_t* first_set; + GHashTable* first; + + GList* rules; + GList* new_rules; + + symbol = (symbol_t*) key; + first_set = (first_set_t*) val; + first = (GHashTable*) data; + + assert (symbol->terminal == FALSE); + + /* If there are no non-recursive rules starting with a + * nonterminal, there's not much to do. Only remove all + * the recursive rules or advance them as described in + * the first_prepare. + */ + if (first_set->rules == NULL) + { + if (first_set->recursive != NULL) + { + if (first_set->has_empty) + { + first_advance_recursive (first_set, symbol); + } + else + { + GList* l; + l = first_set->recursive; + while (l != NULL) + { + rule_delete (l->data); + l = g_list_next (l); + } + g_list_free (first_set->recursive); + first_set->recursive = NULL; + } + assert (first_set->recursive == NULL); + } + return; + } + else + { + + rules = first_set->rules; + new_rules = NULL; + + /* + * For each rule, try to eliminate it by checking the first set of + * the nonterminal with starts it. Rules may have been added by + * previous iterations or the advancement of recursive rules. + * So, trivial checks like empty rules and rules starting with + * terminals should be checked too. + */ + while (rules != NULL) + { + + rule_t* rule; + GList* symbols; + symbol_t* first_symbol; + + rule = rules->data; + symbols = grammar_get_rule (rule); + + /* If it is an empty rule, nonterminal may produce the empty + * string */ + if (symbols == NULL) + { + first_set->has_empty = TRUE; + rule_delete (rule); + } + else + { + + first_set_t* next_set; + + first_symbol = symbols->data; + + /* If it starts with a terminal, include it in the first + * set and remove the rule. */ + if (first_symbol->terminal) + { + first_set_insert_terminal (first_set, + symbol_copy (first_symbol)); + rule_delete (rule); + } + /* Get the first set of the nonterminal which starts the rule */ + else if (g_hash_table_lookup_extended (first, first_symbol, NULL, + (gpointer*) &next_set)) + { + /* + * If nonterminal produces the empty string, the first set + * of the sequence of symbols next to it is in the first + * set of the nonterminal we are treating. Add a new rule + * corresponding to this sequence of symbols. This is the + * rule which says: if there is a production A -> y1 y2 yk + * and the empty string is in the first of y1, then first + * of y2 yk is in first of A. + */ + if (next_set->has_empty) + { + rule_t* new_rule; + new_rule = rule_copy (rule); + rule_pop (new_rule); + new_rules = g_list_prepend (new_rules, new_rule); + } + /* The terminals in the first of the leading nonterminal in + * the rule are in the first of our current nonterminal */ + first_set_union (first_set->terminals, + next_set->terminals); + /* + * If there are other rules starting with a nonterminal + * for this nonterminal, they should be copied to the + * current nonterminal, so we can detect and treat indirect + * recursiveness properly. + */ + if (next_set->rules != NULL) + { + /* FIXME: not much of an advance here, watch for + * this in a possible unstopabble loop */ + /* FIXME: copy recursive rules too, with care */ + GList* next_rules; + next_rules = next_set->rules; + while (next_rules != NULL) + { + rule_t* next_rule = rule_copy (next_rules->data); + GList* next_symbols; + next_symbols = grammar_get_rule (next_rule); + /* + * If the rule is empty, both terminals produce the + * empty set. This will be detected for the other + * nonterminal later. Optimization could be done + * here. TODO + */ + if (next_symbols == NULL) + { + first_set->has_empty = TRUE; + rule_delete (next_rule); + } + else + { + symbol_t* next_symbol; + next_symbol = next_symbols->data; + /* TODO: Check this assertion is correct. */ + assert (next_symbol->terminal == FALSE); + /* This is an indirect recursive rule. */ + if (next_symbol->value == symbol->value) + { + first_set->recursive = + g_list_prepend (first_set->recursive, + next_rule); + } + /* Next time we treat current nonterminal, + * we will check this rule directly. */ + else + { + new_rules = g_list_prepend (new_rules, next_rule); + } + } + next_rules = g_list_next (next_rules); + } + } + /* + * If nonterminal has recursive rules, we should expect + * more rules to add here and so we will check it again + * in the next iteration. Keep the rule. If there is only + * indirect recursiveness, we will detect it through the + * rules we have added before. We should remove it in this + * case so we make advance. + */ + if (next_set->recursive != NULL) + { + new_rules = g_list_prepend (new_rules, rule); + } + else + { + rule_delete (rule); + } + } + else + { + assert (TRUE); + } + + } + + rules = g_list_next (rules); + + } + + g_list_free (first_set->rules); + first_set->rules = new_rules; + + } + +} + +void first_check (gpointer key, gpointer val, gpointer data) +{ + + symbol_t* symbol; + first_set_t* first_set; + gboolean* stop; + + symbol = (symbol_t*) key; + first_set = (first_set_t*) val; + stop = (gboolean*) data; + + assert (symbol->terminal == FALSE); + + /* If there is anything besides terminals in the first set, + * we must iterate more and remove all of them. */ + if (first_set->rules != NULL || first_set->recursive != NULL) + { + *stop = FALSE; + } + +} + +/* + * Calculate the first set for every nonterminal symbol in the grammar. + * We should iterate through the rules for each nonterminal until only + * terminals are known to be in the first set of it. + */ +GHashTable* grammar_first (Grammar* grammar) +{ + GHashTable* first; + gboolean stop; + first = g_hash_table_new_full (symbol_hash, symbol_equal, + g_free, + first_set_delete); + g_hash_table_foreach (grammar->grammar, first_prepare, first); + do + { + g_hash_table_foreach (first, first_iterate, first); + stop = TRUE; + g_hash_table_foreach (first, first_check, &stop); + } while (stop == FALSE); + return first; +} + +void put_key_on_list (gpointer key, gpointer val, gpointer data) +{ + GList** l; + l = (GList**) l; + *l = g_list_prepend (*l, key); +} + +GList* first_get (GHashTable* first, symbol_t* symbol) +{ + + first_set_t* first_set; + GList* l; + l = NULL; + if (g_hash_table_lookup_extended (first, symbol, NULL, + (gpointer*) &first_set)) + { + g_hash_table_foreach (first->terminals, put_key_on_list, &l); + } + return l; + +} diff --git a/first.h b/first.h new file mode 100644 index 0000000..1f0794c --- /dev/null +++ b/first.h @@ -0,0 +1,9 @@ +#ifndef FIRST_H +#define FIRST_H + +#include + +GHashTable* grammar_first (Grammar*); +GList* first_get (GHashTable*, symbol_t*); + +#endif