import pandas as pd
import numpy as np
import re

def parse_file(file_path):
    '''
    Parse a tableau csv file into the structures used downstream.

    The csv header line is consumed by pandas. The first row after the header
    holds the constraint names in columns 5 onward. Every later row describes
    one candidate, laid out as
    [morph key, word form, observed frequency, UR, SR, violations...].
    Empty cells are replaced by 0 before anything else is read.

    Parameters
    ----------
    file_path : a path to a csv file encoding a full tableau

    Returns
    -------
    A 6-tuple (CONS, viols, ur_sr_pair, obs, (thetas, storage), data)

    CONS : a list of constraints (columns 5 onward of the first row)
    viols : a list with, for each candidate row, its violation cells
        (columns 5 onward) exactly as they were read from the csv
    ur_sr_pair : a list of tuples (ur, sr, word form), one per candidate row
    obs : a dictionary of observations, with keys to be a word form, and
        values to be a dictionary of {sr: observed frequency as float}
    thetas : a dictionary which stores the UR probability
        {morph: {ur: P(ur|morph)}}; for each morpheme, possible URs are
        initialized to be equi-probable
    storage : the same information as thetas, flattened into a list of
        [morph, ur, probability] entries
    data : a dictionary, with keys to be a word form, and values to be
        another dictionary of {ur: {sr: integer numpy array of violations}}

    Raises
    ------
    IndexError : if the file has no row after the header, or if a UR has
        fewer segments than the position of the stem in its word form
    '''
    tableaux_list = pd.read_csv(file_path).fillna(0).values.tolist()
    CONS = tableaux_list[0][5:]

    viols = []
    ur_sr_pair = []
    obs = {}
    ur_lists = {}
    data = {}

    # One pass is enough: setdefault creates each nested dictionary the first
    # time its key is seen, and later rows with the same key reuse it.
    for row in tableaux_list[1:]:
        morph, form, freq, ur, sr = row[0], row[1], row[2], row[3], row[4]
        row_viols = row[5:]

        # append instead of list concatenation, which copied the whole list
        # for every row
        ur_sr_pair.append((ur, sr, form))
        viols.append(row_viols)

        obs.setdefault(form, {})[sr] = float(freq)
        ur_lists.setdefault(morph, {}).setdefault(form, {})[ur] = 1
        data.setdefault(form, {}).setdefault(ur, {})[sr] = (
            np.array(row_viols).astype('int'))

    thetas = _collect_candidate_urs(ur_lists)

    # Turn the collected UR sets into uniform distributions.
    storage = []
    for morph, urs in thetas.items():
        if not urs:
            # no UR was ever aligned with this key; nothing to normalise
            continue
        prob = 1 / len(urs)
        for ur in urs:
            urs[ur] = prob
            storage.append([morph, ur, prob])
    return (CONS, viols, ur_sr_pair, obs, (thetas, storage), data)


def _collect_candidate_urs(ur_lists):
    '''
    Collect, for every morpheme, the URs it is aligned with.

    Parameters
    ----------
    ur_lists : a dictionary {morph key: {word form: {ur: 1}}}

    Returns
    -------
    thetas : a dictionary {morpheme: {'/ur/': 1}}; every morph key of
        ur_lists is present, even when no UR was aligned with it

    A word form is split on '_' into morphemes, and a UR is split on
    '-', '/' and '_' into segments (empty pieces dropped). Morphemes and
    segments are aligned by position. A morpheme starting with an upper-case
    letter is treated as a stem; the morphemes before and after it are
    treated as its prefixes and suffixes.
    '''
    thetas = {}
    for morph, forms in ur_lists.items():
        # setdefault rather than assignment: this morpheme may already have
        # URs recorded from an earlier word form in which it occurred as an
        # affix or as a second stem, and those must not be discarded.
        thetas.setdefault(morph, {})
        for single_form, ur_freq in forms.items():
            morphemes = single_form.split('_')
            for ur in ur_freq:
                urs = [seg for seg in re.split('-|/|_', ur) if seg != '']
                for i, stem in enumerate(morphemes):
                    if not stem[0].isupper():
                        continue
                    ur_stem = '/' + urs[i] + '/'
                    # Stems are created on demand, just like affixes below.
                    thetas.setdefault(stem, {})[ur_stem] = 1

                    prefixes = zip(morphemes[:i], urs[:i])
                    # zip stops at the shorter side, so surplus UR segments
                    # or surplus suffixes are ignored
                    suffixes = zip(morphemes[i + 1:], urs[i + 1:])
                    for affix, segment in list(prefixes) + list(suffixes):
                        thetas.setdefault(affix, {})['/' + segment + '/'] = 1
    return thetas



def linguist_urs(file_path):
    '''
    Load the linguist-provided URs from a csv file.

    The csv header line is consumed by pandas; each remaining row
    contributes one entry, using its first cell as the key and its second
    cell as the value. When a key occurs on several rows the last row wins.

    Parameters
    ----------
    file_path : a path to a csv file encoding the linguist URs

    Returns
    -------
    gold_urs: a dictionary {first column value: second column value}
    '''
    rows = pd.read_csv(file_path).values.tolist()
    return {row[0]: row[1] for row in rows}
