i saw somebody else with a large prosthesis and again engaged some prosthesis
project energy :D
i've learned more about dissociation and i took it very slowly and made very
tiny moving-forward progress :}
i made 11 practices of very simple metamodel behavior (holding the idea that a
prosthetic limb could use a low-end chip to learn to adapt to an owner's
changing signals some day) [metamodel work is the hardest part of the project
design for me]
attached is the dependency file i made toward the end. the grad output is
untested, but i've figured out how to make it work if it doesn't.
the grid search was just to figure out what hyperparameters could make a
successful sin(x) model (it was my synthesis test); i haven't done any other
grid search yet
this uses a high-level class from torch.nn, though i've since learned there is
more available now in the newer torchtune package. it's one way to abstract the
idea away, anyway.
the make_settable() etc methods let you use the weights of one model inside the
training graph of another without crashing pytorch, by letting you say
explicitly when that is or isn't happening and what role the weights are playing
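here's roughly how i picture using it, as an untested sketch (the tiny linear
"meta" model is just made up for illustration; TBasic is from the attached file):

# untested sketch: a made-up "meta" model emits a flat weight vector for a TBasic target
import torch

target = TBasic(d_model=4, n_heads=4, d_ff=25, d_in=1, d_out=1)
n_params = len(target.get())            # size of the target's flattened weight vector
meta = torch.nn.Linear(8, n_params)     # stand-in meta model (illustration only)

target.make_settable()                  # target weights become plain tensors, safe to overwrite
opt = torch.optim.SGD(meta.parameters(), lr=1e-3)

ctx = torch.rand([8])                   # some conditioning signal for the meta model
x = torch.linspace(0, 3, 32)[..., None]
y = torch.sin(x)

target.set(meta(ctx))                   # target's weights now live in meta's graph
loss = torch.nn.functional.mse_loss(target(x), y)
loss.backward()                         # grads flow through the target back into meta
opt.step()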
it's just been very hard for me to try any of this at all up until now
import functools, itertools, random
import torch, tqdm
class TBasic(torch.nn.Module):
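    # thin wrapper: optional Linear in -> transformer encoder -> optional Linear out,
    # plus helpers (get/set/make_settable) for treating the whole parameter set as one flat vector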
V = 1
def __init__(self, d_model, n_heads, d_ff, n_layers=1, d_in=None,
d_out=None, dropout=0, device='cpu', dtype=torch.float32):
super().__init__()
mismatched_params = d_model % n_heads
if mismatched_params:
d_model += n_heads - mismatched_params
self.d_model = d_model
self.n_heads = n_heads
self.d_ff = d_ff
self.n_layers = n_layers
self.d_in = d_in
self.d_out = d_out
self.dtype = dtype
self.device = device
self.l_in = None if d_in is None else torch.nn.Linear(d_in, d_model,
device=device, dtype=dtype)
        # wrap the layer in a TransformerEncoder so the n_layers argument actually takes effect
        self.t = torch.nn.TransformerEncoder(
            torch.nn.TransformerEncoderLayer(
                d_model=d_model, nhead=n_heads, dim_feedforward=d_ff,
                dropout=dropout, device=device, dtype=dtype, batch_first=True),
            num_layers=n_layers)
self.l_out = None if d_out is None else torch.nn.Linear(d_model, d_out,
device=device, dtype=dtype)
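        # index every parameter as [dotted name, owning submodule, attribute name]
        # so the attribute can later be swapped out in place by the methods below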
self.__params = [
[fqn, functools.reduce(getattr, ns[:-1], self), ns[-1]]
for fqn, v in super().named_parameters()
for ns in [fqn.split('.')]
]
def make_settable(self):
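        # swap every registered Parameter for a plain cloned tensor so its value can
        # later be replaced (via set()) with a tensor that lives in another model's graph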
for fqn, o, n in self.__params:
v = getattr(o, n)
v = v.clone()
delattr(o, n)
v.retain_grad()
setattr(o, n, v)
return self
def make_trainable(self):
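        # wrap the current values back up as Parameters so a normal optimizer can train them again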
for fqn, o, n in self.__params:
v = getattr(o, n)
v = torch.nn.Parameter(v)
setattr(o, n, v)
return self
def get(self, include_grad=False, flatten=True):
        # feel free to add a condition block to provide some certain form of these
if flatten:
if not include_grad:
return torch.cat([
getattr(o, n).flatten()
for fqn, o, n in self.__params
])
else:
return torch.cat([
torch.stack([
getattr(o, n).flatten(),
getattr(o, n).grad.flatten()
])
for fqn, o, n in self.__params
], dim=-1).T
def set(self, ps):
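        # scatter the flat vector ps back into the individual weight shapes, in the
        # same order get() produced them; call make_settable() first so pytorch
        # accepts plain tensors in place of its registered Parameters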
of = 0
for fqn, o, n in self.__params:
v = getattr(o, n)
of2 = of + len(v.flatten())
v = ps[of:of2].view(v.shape)
v.retain_grad()
setattr(o, n, v)
of = of2
assert of == len(ps)
def parameters(self):
return [getattr(o, n) for fqn, o, n in self.__params]
def named_parameters(self):
return [[fqn, getattr(o, n)] for fqn, o, n in self.__params]
def forward(self, data = None):
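        # convenience: with d_in == 1 a trailing feature dim is added to the input;
        # with d_out == 1 the output's trailing dim is dropped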
if data is None:
data = torch.empty([1,0],dtype=self.dtype)
else:
data = data.to(self.dtype)
if self.l_in:
if self.l_in.weight.shape[-1] == 1:
data = data[...,None]
data = self.l_in(data)
data = self.t(data)
if self.l_out:
data = self.l_out(data)
if self.l_out.weight.shape[0] == 1:
data = data[...,0]
return data
class TGroups(TBasic):
    # these are sequential groups.
    # each group describes a set of sequence items
    # each item has so many floats or ids, and there may be trained embeddings
    # pass name=dict(trained_len=0, trained_dim=d_model, floats=0, ids=0,
    #                id_dim=d_model, out=1) (the values shown are the defaults)
    # note: get()/set()/make_settable() only index parameters created in TBasic.__init__,
    # so the per-group modules created below are not covered by them yet
def __init__(self, **kwparams):
groups = {
k:kwparams.pop(k)
for k,v in list(kwparams.items())
if type(v) is dict
}
super().__init__(**kwparams, d_in=None, d_out=None)
self.groups = {}
for name, kws in groups.items():
traineds = kws.get('trained_len', 0)
trained_embeds = kws.get('trained_dim', self.d_model)
floats = kws.get('floats', 0)
ids = kws.get('ids', 0)
id_embeds = kws.get('id_dim', self.d_model)
out = kws.get('out', 1)
in_size = floats
if ids:
                embedding = torch.nn.Embedding(ids, id_embeds,
                                               device=self.device, dtype=self.dtype)
setattr(self, name + '_embed', embedding)
in_size += id_embeds
else:
embedding = None
if traineds:
                trained = torch.nn.Parameter(torch.rand([traineds, trained_embeds],
                    device=self.device, dtype=self.dtype))
setattr(self, name + '_trained', trained)
in_size += trained_embeds
else:
trained = None
            l_in = torch.nn.Linear(in_size, self.d_model,
                                   device=self.device, dtype=self.dtype)
setattr(self, name + '_in', l_in)
            l_out = torch.nn.Linear(self.d_model, out,
                                    device=self.device, dtype=self.dtype)
setattr(self, name + '_out', l_out)
self.groups[name] = {
'trained': trained,
'n_floats': floats,
'embed': embedding,
'in': l_in,
'out': l_out,
}
def forward(self, **kwparams):
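        # project each group into the shared d_model space, run the encoder once over the
        # concatenated sequence, then slice the output apart and apply each group's own head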
data = []
off = 0
groups = []
for name, kws in self.groups.items():
trained = kws['trained']
embed = kws['embed']
n_floats = kws['n_floats']
l_in = kws['in']
l_out = kws['out']
            gdata = trained
            if n_floats:
                floats = kwparams.get(name)
                if floats is None:
                    floats = kwparams[name + '_floats']
                # torch.cat takes a sequence; also handle groups with no trained rows
                gdata = floats if gdata is None else torch.cat([gdata, floats], dim=-1)
            if embed:
                ids = kwparams.get(name)
                if ids is None:
                    ids = kwparams[name + '_ids']
                embeds = embed(ids)
                gdata = embeds if gdata is None else torch.cat([gdata, embeds], dim=-1)
gdata = l_in(gdata)
data.append(gdata)
off2 = off + gdata.shape[-2]
groups.append([name, off, off2, l_out])
off = off2
data = self.t(torch.cat(data, dim=-2))
        # slice along the sequence dimension (dim -2), not the leading/batch dimension
        return {
            name:
                out(data[..., off:off2, :]) if out.weight.shape[0] > 1
                else out(data[..., off:off2, :])[..., 0]
            for name, off, off2, out in groups
        }
#data = l_in(data)
#data = self.t(data)
#data = l_out(data)
#if data.shape[-1] == 1:
# data = data[...,0]
def sin_cos_pos_embeds(seq_len, n_embeds, dtype, device):
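    # returns [seq_len, n_embeds] sin/cos position embeddings at octave-spaced frequencies
    # (not used by the demo below yet)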
n_spectra = (n_embeds+1) // 2
    #thetas = torch.linspace(0, 2*torch.pi, seq_len+1, dtype=dtype, device=device)[:-1]
#scales = 2**torch.arange(n_spectra, dtype=dtype, device=device)
thetas = torch.arange(seq_len, dtype=dtype, device=device)
scales = 2**(-torch.arange(n_spectra, dtype=dtype, device=device))
embeds = torch.outer(scales, thetas)
    embeds = torch.cat([torch.sin(embeds), torch.cos(embeds)],
                       dim=-1).view(n_spectra*2, seq_len)
return embeds.T[:,:n_embeds]
optims = [torch.optim.SGD, torch.optim.Adam, torch.optim.AdamW]
def flat_grid_search(data, labels, **kw_min_max_steps_proc):
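    # every keyword maps to [min, max, steps, proc]: candidate values come from
    # torch.linspace(min, max, steps), get shuffled, and proc turns each sampled float
    # into the final hyperparameter value; yields [loss, kwparams] for every combination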
key_proc_lists = [
[key, proc, torch.linspace(min,max,steps)[torch.randperm(steps)]]
for key, min_max_steps_proc in kw_min_max_steps_proc.items()
for min, max, steps, proc in [min_max_steps_proc]
]
# it would be nice to do a few steps of all of them, then continue
# this i suppose would mean caching all the objects and states
    all_combinations = list(itertools.product(
        *[values for key, proc, values in key_proc_lists]))
random.shuffle(all_combinations)
#all_combinations = all_combinations[torch.randperm(len(all_combinations))]
for values in all_combinations:
kwparams = {
key: proc(values[idx])
for idx in range(len(values))
for key, proc, list in [key_proc_lists[idx]]
}
dtype = kwparams.get('dtype', data.dtype)
data = data.to(dtype)
labels = labels.to(dtype)
lr = kwparams.pop('lr', 1e-7)
steps = kwparams.pop('steps', 100)
optim = kwparams.pop('optim', optims[0])
if 'd_model_per_head' in kwparams:
            kwparams['d_model'] = (kwparams.pop('d_model_per_head')
                                   * kwparams['n_heads'])
t = TBasic(d_in = data.shape[-1], d_out = labels.shape[-1], **kwparams)
optim_obj = optim(t.parameters(), lr=lr)
for s in range(steps):
l = torch.nn.functional.mse_loss(t(data), labels)
l.backward()
optim_obj.step()
optim_obj.zero_grad()
kwparams['lr'] = lr
kwparams['optim'] = optim
kwparams['steps'] = steps
yield [l.item(), kwparams]
def make_sin_arch():
# somewhat arbitrary
#lr = 0.003162277629598975
    #return TBasic(d_in=1, d_out=1, d_ff=35, n_heads=4, n_layers=2,
    #              dtype=torch.float64, d_model=12, dropout=0)
    lr = 0.01
    return TBasic(d_in=1, d_out=1, d_ff=25, n_heads=4, n_layers=1,
                  dtype=torch.float32, d_model=4, dropout=0)
# d_ff=35 n_heads=4 n_layers=2 dtype=torch.float64 d_model=12
# can it predict its loss?
if __name__ == '__main__':
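    # demo / synthesis test: grid-search TBasic hyperparameters on fitting sin(x) over [0, 3],
    # printing the 3 best and 3 worst configurations every 4 trials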
data = torch.linspace(0, 3, 256)[...,None]
labels = torch.sin(data)
results = []
for result in flat_grid_search(
data=data,
labels=labels,
#optim=[0,len(optims)-1,len(optims),lambda idx: optims[int(idx)]],
optim=[2,2,1,lambda idx: optims[int(idx)]], # adamw
steps=[500,933,4,int],#[50,200,3,int],
lr=[-2,-3,3,lambda x:10**x],
d_model_per_head=[1,3,3,int],
d_ff=[25,35,3,int],
n_heads=[3,5,3,int],
n_layers=[1,2,2,int],
dropout=[0,0,1,float],#[0,0.025,2,float],
dtype=[0,1,2,lambda idx: [torch.float32,torch.float64][int(idx)]],
):
results.append(result)
if len(results) % 4 != 0:
continue
        results.sort(key=lambda r: r[0])
numbered = list(enumerate(results))
print()
for idx, r in numbered[:3] + numbered[-3:]:
l, kwp = r
print(idx, l, end=' ')
for k, v in kwp.items():
print(f'{k}:{v} ; ', end='')
print()