@@ -345,7 +345,7 @@ def mtmd_bitmap_init(
345345)
346346def mtmd_bitmap_init_from_audio (
347347 n_samples : c_uint ,
348- data : POINTER (c_float ),
348+ data : POINTER (c_float ), # type: ignore
349349 / ,
350350) -> mtmd_bitmap_p :
351351 ...
@@ -582,6 +582,9 @@ class mtmd_decoder_pos(Structure):
582582 x : c_uint32
583583 y : c_uint32
584584
585+ mtmd_decoder_pos_p = POINTER (mtmd_decoder_pos ) # typed pointer to mtmd_decoder_pos, used in Python-side annotations
586+ mtmd_decoder_pos_p_ctypes = c_void_p # type handed to the ctypes function registration for that pointer (plain void*)
587+
585588# // get position for decoder attention, to be used by M-RoPE models
586589# // i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1
587590# // return relative position (for example, embedding 0 will have position (0, 0, 0);
@@ -633,7 +636,7 @@ def mtmd_tokenize(
633636 ctx : mtmd_context_p ,
634637 output : mtmd_input_chunks_p ,
635638 text : mtmd_input_text_p ,
636- bitmaps : POINTER (mtmd_bitmap_p ),
639+ bitmaps : POINTER (mtmd_bitmap_p ), # type: ignore
637640 n_bitmaps : c_uint ,
638641 / ,
639642) -> c_int32 :
@@ -691,7 +694,7 @@ def mtmd_encode_chunk(
691694# MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
692695@ctypes_function_mtmd (
693696 "mtmd_get_output_embd" , [mtmd_context_p_ctypes ], POINTER (c_float ))
694- def mtmd_get_output_embd (ctx : mtmd_context_p ) -> POINTER (c_float ):
697+ def mtmd_get_output_embd (ctx : mtmd_context_p ) -> POINTER (c_float ): # type: ignore
695698 """
696699 get output embeddings from the last encode pass
697700 """
@@ -703,7 +706,7 @@ def mtmd_get_output_embd(ctx: mtmd_context_p) -> POINTER(c_float):
703706# MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
704707@ctypes_function_mtmd (
705708 "mtmd_log_set" , [ggml_log_callback , c_void_p ], None )
706- def mtmd_log_set (log_callback : ggml_log_callback , user_data : c_void_p ):
709+ def mtmd_log_set (log_callback : ggml_log_callback , user_data : c_void_p ): # type: ignore
707710 """
708711 Set callback for all future logging events.
709712 """
@@ -735,7 +738,7 @@ def mtmd_test_create_input_chunks() -> mtmd_input_chunk_p:
735738# MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data);
736739@ctypes_function_mtmd (
737740 "mtmd_helper_log_set" , [ggml_log_callback , c_void_p ], None )
738- def mtmd_helper_log_set (log_callback : ggml_log_callback , user_data : c_void_p ):
741+ def mtmd_helper_log_set (log_callback : ggml_log_callback , user_data : c_void_p ): # type: ignore
739742 """
740743 Set callback for all future logging events.
741744 """
@@ -810,6 +813,25 @@ def mtmd_helper_get_n_pos(chunks: mtmd_input_chunk_p) -> c_int32:
810813 ...
811814
812815
816+ # // helper to get the list of relative positions corresponding to the embedding tokens, to be used by M-RoPE
817+ # // out_pos must have length == mtmd_helper_get_n_tokens(image)
818+ # MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, struct mtmd_decoder_pos * out_pos);
819+ @ctypes_function_mtmd ("mtmd_helper_image_get_decoder_pos" , [
820+ mtmd_image_tokens_p_ctypes ,
821+ mtmd_decoder_pos_p_ctypes
822+ ],
823+ None )
824+ def mtmd_helper_image_get_decoder_pos (
825+ image : mtmd_image_tokens_p ,
826+ out_pos : mtmd_decoder_pos_p # type: ignore
827+ ) -> c_int32 :
828+ """
829+ helper to get the list of relative positions corresponding to the embedding tokens, to be used by M-RoPE
830+ out_pos must have length == mtmd_helper_get_n_tokens(image)
831+ """
832+ ...
833+
834+
813835# // helper function that automatically:
814836# // 1. run llama_decode() on text chunks
815837# // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
@@ -844,7 +866,7 @@ def mtmd_helper_eval_chunks(
844866 seq_id : c_int32 ,
845867 n_batch : c_int32 ,
846868 logits_last : c_bool ,
847- new_n_past : POINTER (c_int32 ),
869+ new_n_past : POINTER (c_int32 ), # type: ignore
848870 / ,
849871) -> c_int32 :
850872 """
@@ -887,7 +909,7 @@ def mtmd_helper_eval_chunk_single(
887909 seq_id : c_int32 ,
888910 n_batch : c_int32 ,
889911 logits_last : c_bool ,
890- new_n_past : POINTER (c_int32 ),
912+ new_n_past : POINTER (c_int32 ), # type: ignore
891913 / ,
892914) -> c_int32 :
893915 """
@@ -923,7 +945,7 @@ def mtmd_helper_decode_image_chunk(
923945 ctx : mtmd_context_p ,
924946 lctx : llama_cpp .llama_context_p ,
925947 chunks : mtmd_input_chunk_p ,
926- encoded_embd : POINTER (c_float ),
948+ encoded_embd : POINTER (c_float ), # type: ignore
927949 n_past : c_int32 ,
928950 seq_id : c_int32 ,
929951 n_batch : c_int32 ,
0 commit comments