# Copyright (c) OpenMMLab. All rights reserved.
from collections import OrderedDict
from typing import Dict, Optional, Tuple, Union
import math

import torch.nn as nn
from mmengine.config import Config, ConfigDict
from mmengine.model import BaseModel
from peft import get_peft_model, prepare_model_for_kbit_training

from xtuner.registry import BUILDER
from .modules import ProjectorConfig, ProjectorModel, dispatch_modules
from .utils import (LoadWoInit, find_all_linear_names,
                    get_peft_model_state_dict,
                    make_inputs_require_grad,
                    prepare_inputs_labels_for_multimodal, traverse_dict)


class LLaMASpeechModel(BaseModel):
    """Language model driven by speech features through a projector.

    A speech encoder turns input features into hidden states, a
    ``ProjectorModel`` maps them to the LLM hidden size, and
    ``prepare_inputs_labels_for_multimodal`` assembles the LLM inputs.

    Args:
        llm: An ``nn.Module`` or a dict config built through ``BUILDER``.
        speech_encoder: An ``nn.Module`` or a dict config built through
            ``BUILDER``. Only its ``encoder`` sub-module is executed.
        freeze_llm: If True, gradients are disabled for the LLM.
        freeze_speech_encoder: If True, gradients are disabled for the
            speech encoder.
        projector_depth: Forwarded to ``ProjectorConfig`` as ``depth``.
        pooling: Forwarded to ``ProjectorConfig``. When positive it is also
            used in ``forward`` to shrink ``repr_lens``.
        llm_lora: LoRA config (object or dict config) for the LLM, or None
            to skip LoRA.
        speech_encoder_lora: LoRA config (object or dict config) for the
            speech encoder, or None to skip LoRA.
        use_activation_checkpointing: If True, activation checkpointing is
            enabled on the LLM, the speech encoder and the projector.
        model_name: Either ``'llama2'`` or ``'llama3'``. Stored on the
            instance; not otherwise used inside this class.

    Raises:
        AssertionError: If ``model_name`` is not a supported value.
    """

    def __init__(self,
                 llm,
                 speech_encoder,
                 freeze_llm=False,
                 freeze_speech_encoder=False,
                 projector_depth=2,
                 pooling=-1,
                 llm_lora=None,
                 speech_encoder_lora=None,
                 use_activation_checkpointing=True,
                 model_name='llama2'):
        # Fail fast: reject an unsupported name before any model is built.
        assert model_name in ['llama2', 'llama3'], (
            f"model_name must be 'llama2' or 'llama3', got {model_name!r}")
        super().__init__()
        self.model_name = model_name
        self.freeze_llm = freeze_llm
        self.freeze_speech_encoder = freeze_speech_encoder
        with LoadWoInit():
            self.llm = self._build_from_cfg_or_module(llm)
            self.speech_encoder = self._build_from_cfg_or_module(
                speech_encoder)
            # Only the `encoder` sub-module of the speech model is run (see
            # `forward_whisper_encoder`); its other sub-modules are kept
            # attached and are not deleted here.
        self.llm.config.use_cache = False
        dispatch_modules(self.llm)

        projector_config = ProjectorConfig(
            visual_hidden_size=self.speech_encoder.config.hidden_size,
            llm_hidden_size=self.llm.config.hidden_size,
            pooling=pooling,
            depth=projector_depth)
        self.pooling = pooling
        # Keep the projector in the same dtype as the encoder feeding it.
        self.projector = ProjectorModel(projector_config).to(
            self.speech_encoder.dtype)

        if self.freeze_llm:
            self.llm.requires_grad_(False)
        if self.freeze_speech_encoder:
            self.speech_encoder.requires_grad_(False)

        if use_activation_checkpointing:
            # For backward compatibility
            if hasattr(self.llm, 'enable_input_require_grads'):
                self.llm.enable_input_require_grads()
            else:
                self.llm.get_input_embeddings().register_forward_hook(
                    make_inputs_require_grad)
            # NOTE(review): for an encoder-decoder speech model the input
            # embeddings may belong to a part that
            # `forward_whisper_encoder` never runs -- confirm this hook
            # actually reaches the encoder inputs.
            if hasattr(self.speech_encoder, 'enable_input_require_grads'):
                self.speech_encoder.enable_input_require_grads()
            else:
                self.speech_encoder.get_input_embeddings(
                ).register_forward_hook(make_inputs_require_grad)
            self.projector.enable_input_require_grads()

            # enable gradient (activation) checkpointing for memory efficiency
            self.gradient_checkpointing_enable()

        self.use_llm_lora = llm_lora is not None
        self.use_speech_encoder_lora = speech_encoder_lora is not None

        if self.use_llm_lora:
            self._prepare_llm_for_lora(llm_lora, use_activation_checkpointing)
        if self.use_speech_encoder_lora:
            self._prepare_speech_encoder_for_lora(
                speech_encoder_lora, use_activation_checkpointing)

        self._is_init = True

    def _parse_lora_config(self, lora_config):
        """Build ``lora_config`` through ``BUILDER`` if it is a dict config.

        Anything that is not a ``dict``/``Config``/``ConfigDict`` is returned
        unchanged.
        """
        if isinstance(lora_config, (dict, Config, ConfigDict)):
            lora_config = BUILDER.build(lora_config)
        return lora_config

    def _prepare_llm_for_lora(self,
                              lora_config,
                              use_activation_checkpointing=True):
        """Wrap ``self.llm`` with peft using ``lora_config``.

        When the config has no ``target_modules``, the names returned by
        ``find_all_linear_names`` are used.
        """
        lora_config = self._parse_lora_config(lora_config)
        self.llm = prepare_model_for_kbit_training(
            self.llm, use_activation_checkpointing)
        if lora_config.target_modules is None:
            lora_config.target_modules = find_all_linear_names(self.llm)
        self.llm = get_peft_model(self.llm, lora_config)

    def _prepare_speech_encoder_for_lora(self,
                                         lora_config,
                                         use_activation_checkpointing=True):
        """Wrap ``self.speech_encoder`` with peft using ``lora_config``.

        ``use_activation_checkpointing`` is accepted for signature parity
        with ``_prepare_llm_for_lora`` but is not used here.
        """
        lora_config = self._parse_lora_config(lora_config)
        if lora_config.target_modules is None:
            lora_config.target_modules = find_all_linear_names(
                self.speech_encoder)
        self.speech_encoder = get_peft_model(self.speech_encoder, lora_config)

    def gradient_checkpointing_enable(self):
        """Alias of ``activation_checkpointing_enable``."""
        self.activation_checkpointing_enable()

    def activation_checkpointing_enable(self):
        """Enable checkpointing on the LLM, speech encoder and projector."""
        self.llm.gradient_checkpointing_enable()
        self.speech_encoder.gradient_checkpointing_enable()
        self.projector.gradient_checkpointing_enable()

    def gradient_checkpointing_disable(self):
        """Alias of ``activation_checkpointing_disable``."""
        self.activation_checkpointing_disable()

    def activation_checkpointing_disable(self):
        """Disable checkpointing on the LLM, speech encoder and projector."""
        self.llm.gradient_checkpointing_disable()
        self.speech_encoder.gradient_checkpointing_disable()
        self.projector.gradient_checkpointing_disable()

    def init_weights(self):
        """Do nothing; no weight initialization is performed here."""
        pass

    def state_dict(self, *args, **kwargs):
        """Return a reduced state dict.

        Included entries:
          * speech encoder: LoRA weights if LoRA is used, otherwise all of
            its weights unless it is frozen;
          * LLM: LoRA weights if LoRA is used, otherwise all of its weights
            unless it is frozen;
          * projector: always.
        """
        state_dict = super().state_dict(*args, **kwargs)
        to_return = OrderedDict()
        # Step 1. speech_encoder
        if self.use_speech_encoder_lora:
            to_return.update(
                get_peft_model_state_dict(
                    self.speech_encoder, state_dict=state_dict))
        elif not self.freeze_speech_encoder:
            to_return.update({
                k: v
                for k, v in state_dict.items() if 'speech_encoder.' in k
            })
        # Step 2. LLM
        if self.use_llm_lora:
            to_return.update(
                get_peft_model_state_dict(self.llm, state_dict=state_dict))
        elif not self.freeze_llm:
            to_return.update(
                {k: v
                 for k, v in state_dict.items() if 'llm.' in k})
        # Step 3. Projector
        to_return.update(
            {k: v
             for k, v in state_dict.items() if 'projector.' in k})
        return to_return

    def _build_from_cfg_or_module(self, cfg_or_mod):
        """Return ``cfg_or_mod`` if it is a module, else build it from a dict.

        Raises:
            NotImplementedError: If the argument is neither an ``nn.Module``
                nor a ``dict``.
        """
        if isinstance(cfg_or_mod, nn.Module):
            return cfg_or_mod
        elif isinstance(cfg_or_mod, dict):
            traverse_dict(cfg_or_mod)
            return BUILDER.build(cfg_or_mod)
        else:
            raise NotImplementedError(
                'Expected an nn.Module or a dict config, got '
                f'{type(cfg_or_mod).__name__}')

    def forward(self, data, data_samples=None, mode='loss'):
        """Encode speech (if present) and dispatch on ``mode``.

        Args:
            data: Mapping of keyword arguments for the LLM. If it contains
                ``'speech_repr'`` it must also contain ``'repr_lens'``; the
                speech features are encoded, projected and merged through
                ``prepare_inputs_labels_for_multimodal``.
            data_samples: Passed through to the mode handler.
            mode: ``'loss'``, ``'predict'`` or ``'tensor'``.

        Raises:
            NotImplementedError: If ``mode`` is not one of the above.
        """
        if 'speech_repr' in data:
            # Work on a shallow copy so the caller's batch is not
            # overwritten with projected features / rescaled lengths.
            data = dict(data)
            speech_outputs = self.forward_whisper_encoder(data['speech_repr'])
            data['speech_repr'] = self.projector(speech_outputs)
            # Only rescale for a positive pooling factor. The default of -1
            # would otherwise yield negative lengths and 0 would divide by
            # zero. NOTE(review): assumes a non-positive `pooling` means the
            # projector does not pool -- confirm against ProjectorModel.
            if self.pooling > 0:
                data['repr_lens'] = [
                    math.ceil(length / self.pooling)
                    for length in data['repr_lens']
                ]
            data = prepare_inputs_labels_for_multimodal(llm=self.llm, **data)

        if mode == 'loss':
            return self.compute_loss(data, data_samples)
        elif mode == 'predict':
            return self.predict(data, data_samples)
        elif mode == 'tensor':
            return self._forward(data, data_samples)
        else:
            raise NotImplementedError(f'Unsupported forward mode: {mode!r}')

    def forward_whisper_encoder(self, mel_feature):
        """Run the speech model's encoder and return ``last_hidden_state``.

        The input is cast to the dtype of the speech encoder's first
        parameter (left untouched if the encoder has no parameters).
        """
        first_param = next(self.speech_encoder.parameters(), None)
        if first_param is not None:
            mel_feature = mel_feature.type(first_param.dtype)
        base_model = self.speech_encoder.base_model
        # `base_model` may itself wrap the real model under `.model`
        # (presumably the peft-wrapped case); unwrap one level if so.
        inner_model = getattr(base_model, 'model', base_model)
        return inner_model.encoder(mel_feature).last_hidden_state

    def _forward(self, data, data_samples=None):
        """Return the raw LLM outputs for ``data``."""
        outputs = self.llm(**data)

        return outputs

    def predict(self, data, data_samples=None):
        """Return one ``{'logits', 'loss'}`` dict per entry of the logits.

        The same ``outputs.loss`` value is attached to every entry.
        """
        outputs = self.llm(**data)
        logits_dict = [{
            'logits': logits,
            'loss': outputs.loss
        } for logits in outputs.logits]
        return logits_dict

    def compute_loss(self, data, data_samples=None):
        """Return ``{'loss': ...}`` taken from the LLM outputs."""
        outputs = self.llm(**data)
        loss_dict = {'loss': outputs.loss}
        return loss_dict

    def __getattr__(self, name: str):
        """Fall back to attributes of ``self.llm`` when lookup fails here."""
        try:
            return super().__getattr__(name)
        except AttributeError:
            # If `llm` itself is missing (e.g. during construction), the
            # fallback below would re-enter this method forever; surface the
            # AttributeError instead of a RecursionError.
            if name == 'llm':
                raise
            return getattr(self.llm, name)
