Od dłuższego czasu zastanawiałem się nad stworzeniem własnego języka skryptowego bazującego na składni PHP oraz Matlaba. Już kilka razy podchodziłem do realizacji, ale zawsze się zatrzymuję i nie mogę przebrnąć przez fazę zapewnienia łatwej rozszerzalności. Ostatnio udało mi się stworzyć tokenizer, który rozbija ciąg znaków na tokeny.
Dla przykładu ciąg (który zarazem prezentuje składnię języka, którą chciałem zaimplementować):
Kod
include("./Main.shl");
a=[10,102,25,854]*10;
if(10<a){
test(a);
}
elseif(a<10){
45;
}
else{
104;
};
a[0] = 11;
[b,c]=test(a);
foreach(a : value){
println(value);
};
switch(te){
case 1:
a[0] =+ 10;
break;
case 2:
a[1] =-10;
default:
println(a);
break;
};
/* komentarz który nie jest brany pod uwagę podczas interpretacji
może on być wieloliniowy
podobnie jak w php */
T = false;
T1 = true;
exit(100);
for(x = 0,y=0;x < 1000;x++,y++){
if(x % 2 == 0){
y += x;
continue;
};
println(y . x);
};
unset(a);
funkcja:(var1,var2) = {
out = var1 + var2;
return out;
}
a[3] = funkcja(a[0],a[1]);
a=[10,102,25,854]*10;
if(10<a){
test(a);
}
elseif(a<10){
45;
}
else{
104;
};
a[0] = 11;
[b,c]=test(a);
foreach(a : value){
println(value);
};
switch(te){
case 1:
a[0] =+ 10;
break;
case 2:
a[1] =-10;
default:
println(a);
break;
};
/* komentarz który nie jest brany pod uwagę podczas interpretacji
może on być wieloliniowy
podobnie jak w php */
T = false;
T1 = true;
exit(100);
for(x = 0,y=0;x < 1000;x++,y++){
if(x % 2 == 0){
y += x;
continue;
};
println(y . x);
};
unset(a);
funkcja:(var1,var2) = {
out = var1 + var2;
return out;
}
a[3] = funkcja(a[0],a[1]);
tokenizer potrafi wstępnie rozbić na następujące tokeny:
Kod
T_INCLUDE -> 1 | 1 => include
T_LOPARENT -> 1 | 8 => (
T_STRING -> 1 | 9 => ./Main.shl
T_SEMICOLON -> 1 | 22 =>;
T_ALNUM -> 3 | 2 => a
T_OPERATOR -> 3 | 3 => =
T_LSPARENT -> 3 | 4 => [
T_NUMBER -> 3 | 5 => 10
T_COMMA -> 3 | 7 => ,
T_NUMBER -> 3 | 8 => 102
T_COMMA -> 3 | 11 => ,
T_NUMBER -> 3 | 12 => 25
T_COMMA -> 3 | 14 => ,
T_NUMBER -> 3 | 15 => 854
T_OPERATOR -> 3 | 19 => *
T_NUMBER -> 3 | 20 => 10
T_SEMICOLON -> 3 | 22 =>;
T_IF -> 4 | 2 => if
T_LOPARENT -> 4 | 4 => (
T_NUMBER -> 4 | 5 => 10
T_OPERATOR -> 4 | 7 => <
T_ALNUM -> 4 | 8 => a
T_LBRACKET -> 4 | 10 => {
T_ALNUM -> 5 | 3 => test
T_LOPARENT -> 5 | 7 => (
T_ALNUM -> 5 | 8 => a
T_SEMICOLON -> 5 | 10 =>;
T_ELSEIF -> 7 | 2 => elseif
T_LOPARENT -> 7 | 8 => (
T_ALNUM -> 7 | 9 => a
T_OPERATOR -> 7 | 10 => <
T_NUMBER -> 7 | 11 => 10
T_LBRACKET -> 7 | 14 => {
T_NUMBER -> 8 | 3 => 45
T_SEMICOLON -> 8 | 5 =>;
T_ELSE -> 10 | 2 => else
T_LBRACKET -> 10 | 6 => {
T_NUMBER -> 11 | 3 => 104
T_SEMICOLON -> 11 | 6 =>;
T_SEMICOLON -> 12 | 3 =>;
T_ALNUM -> 13 | 2 => a
T_LSPARENT -> 13 | 3 => [
T_NUMBER -> 13 | 4 => 0
T_OPERATOR -> 13 | 7 => =
T_NUMBER -> 13 | 9 => 11
T_SEMICOLON -> 13 | 11 =>;
T_LSPARENT -> 14 | 2 => [
T_ALNUM -> 14 | 3 => b
T_COMMA -> 14 | 4 => ,
T_ALNUM -> 14 | 5 => c
T_OPERATOR -> 14 | 7 => =
T_ALNUM -> 14 | 8 => test
T_LOPARENT -> 14 | 12 => (
T_ALNUM -> 14 | 13 => a
T_SEMICOLON -> 14 | 15 =>;
T_FOREACH -> 16 | 2 => foreach
T_LOPARENT -> 16 | 9 => (
T_ALNUM -> 16 | 10 => a
T_COLON -> 16 | 12 => :
T_ALNUM -> 16 | 14 => value
T_LBRACKET -> 16 | 20 => {
T_ALNUM -> 17 | 3 => println
T_LOPARENT -> 17 | 10 => (
T_ALNUM -> 17 | 11 => value
T_SEMICOLON -> 17 | 17 =>;
T_SEMICOLON -> 18 | 3 =>;
T_SWITCH -> 19 | 2 => switch
T_LOPARENT -> 19 | 8 => (
T_ALNUM -> 19 | 9 => te
T_LBRACKET -> 19 | 12 => {
T_CASE -> 20 | 3 => case
T_NUMBER -> 20 | 8 => 1
T_COLON -> 20 | 9 => :
T_ALNUM -> 21 | 4 => a
T_LSPARENT -> 21 | 5 => [
T_NUMBER -> 21 | 6 => 0
T_OPERATOR -> 21 | 9 => =+
T_NUMBER -> 21 | 12 => 10
T_SEMICOLON -> 21 | 14 =>;
T_BREAK -> 22 | 3 => break
T_SEMICOLON -> 22 | 8 =>;
T_CASE -> 23 | 3 => case
T_NUMBER -> 23 | 8 => 2
T_COLON -> 23 | 9 => :
T_ALNUM -> 24 | 4 => a
T_LSPARENT -> 24 | 5 => [
T_NUMBER -> 24 | 6 => 1
T_OPERATOR -> 24 | 9 => =-
T_NUMBER -> 24 | 11 => 10
T_SEMICOLON -> 24 | 13 =>;
T_DEFAULT -> 25 | 3 => default
T_COLON -> 25 | 10 => :
T_ALNUM -> 26 | 4 => println
T_LOPARENT -> 26 | 11 => (
T_ALNUM -> 26 | 12 => a
T_SEMICOLON -> 26 | 14 =>;
T_BREAK -> 27 | 3 => break
T_SEMICOLON -> 27 | 8 =>;
T_SEMICOLON -> 28 | 3 =>;
T_COMMENT -> 29 | 2 => /* komentarz który nie jest brany pod uwagę podczas interpretacji
może on być wieloliniowy
podobnie jak w php */
T_ALNUM -> 32 | 2 => T
T_OPERATOR -> 32 | 4 => =
T_FALSE -> 32 | 6 => false
T_SEMICOLON -> 32 | 11 =>;
T_ALNUM -> 33 | 2 => T1
T_OPERATOR -> 33 | 5 => =
T_TRUE -> 33 | 7 => true
T_SEMICOLON -> 33 | 11 =>;
T_EXIT -> 35 | 2 => exit
T_LOPARENT -> 35 | 6 => (
T_NUMBER -> 35 | 7 => 100
T_SEMICOLON -> 35 | 11 =>;
T_FOR -> 37 | 2 => for
T_LOPARENT -> 37 | 5 => (
T_ALNUM -> 37 | 6 => x
T_OPERATOR -> 37 | 8 => =
T_NUMBER -> 37 | 10 => 0
T_COMMA -> 37 | 11 => ,
T_ALNUM -> 37 | 12 => y
T_OPERATOR -> 37 | 13 => =
T_NUMBER -> 37 | 14 => 0
T_SEMICOLON -> 37 | 15 =>;
T_ALNUM -> 37 | 16 => x
T_OPERATOR -> 37 | 18 => <
T_NUMBER -> 37 | 20 => 1000
T_SEMICOLON -> 37 | 24 =>;
T_ALNUM -> 37 | 25 => x
T_OPERATOR -> 37 | 26 => ++
T_COMMA -> 37 | 28 => ,
T_ALNUM -> 37 | 29 => y
T_OPERATOR -> 37 | 30 => ++
T_LBRACKET -> 37 | 33 => {
T_IF -> 38 | 3 => if
T_LOPARENT -> 38 | 5 => (
T_ALNUM -> 38 | 6 => x
T_OPERATOR -> 38 | 8 => %
T_NUMBER -> 38 | 10 => 2
T_OPERATOR -> 38 | 12 => ==
T_NUMBER -> 38 | 15 => 0
T_LBRACKET -> 38 | 17 => {
T_ALNUM -> 39 | 4 => y
T_OPERATOR -> 39 | 6 => +=
T_ALNUM -> 39 | 9 => x
T_SEMICOLON -> 39 | 10 =>;
T_CONTINUE -> 40 | 4 => continue
T_SEMICOLON -> 40 | 12 =>;
T_SEMICOLON -> 41 | 4 =>;
T_ALNUM -> 42 | 3 => println
T_LOPARENT -> 42 | 10 => (
T_ALNUM -> 42 | 11 => y
T_OPERATOR -> 42 | 13 => .
T_ALNUM -> 42 | 15 => x
T_SEMICOLON -> 42 | 17 =>;
T_SEMICOLON -> 43 | 3 =>;
T_UNSET -> 44 | 2 => unset
T_LOPARENT -> 44 | 7 => (
T_ALNUM -> 44 | 8 => a
T_SEMICOLON -> 44 | 10 =>;
T_ALNUM -> 46 | 2 => funkcja
T_COLON -> 46 | 9 => :
T_LOPARENT -> 46 | 10 => (
T_ALNUM -> 46 | 11 => var1
T_COMMA -> 46 | 15 => ,
T_ALNUM -> 46 | 16 => var2
T_OPERATOR -> 46 | 22 => =
T_LBRACKET -> 46 | 24 => {
T_ALNUM -> 47 | 3 => out
T_OPERATOR -> 47 | 7 => =
T_ALNUM -> 47 | 9 => var1
T_OPERATOR -> 47 | 14 => +
T_ALNUM -> 47 | 16 => var2
T_SEMICOLON -> 47 | 20 =>;
T_RETURN -> 48 | 3 => return
T_ALNUM -> 48 | 10 => out
T_SEMICOLON -> 48 | 13 =>;
T_ALNUM -> 51 | 2 => a
T_LSPARENT -> 51 | 3 => [
T_NUMBER -> 51 | 4 => 3
T_OPERATOR -> 51 | 7 => =
T_ALNUM -> 51 | 9 => funkcja
T_LOPARENT -> 51 | 16 => (
T_ALNUM -> 51 | 17 => a
T_LSPARENT -> 51 | 18 => [
T_NUMBER -> 51 | 19 => 0
T_COMMA -> 51 | 21 => ,
T_ALNUM -> 51 | 22 => a
T_LSPARENT -> 51 | 23 => [
T_NUMBER -> 51 | 24 => 1
T_SEMICOLON -> 51 | 27 =>;
T_LOPARENT -> 1 | 8 => (
T_STRING -> 1 | 9 => ./Main.shl
T_SEMICOLON -> 1 | 22 =>;
T_ALNUM -> 3 | 2 => a
T_OPERATOR -> 3 | 3 => =
T_LSPARENT -> 3 | 4 => [
T_NUMBER -> 3 | 5 => 10
T_COMMA -> 3 | 7 => ,
T_NUMBER -> 3 | 8 => 102
T_COMMA -> 3 | 11 => ,
T_NUMBER -> 3 | 12 => 25
T_COMMA -> 3 | 14 => ,
T_NUMBER -> 3 | 15 => 854
T_OPERATOR -> 3 | 19 => *
T_NUMBER -> 3 | 20 => 10
T_SEMICOLON -> 3 | 22 =>;
T_IF -> 4 | 2 => if
T_LOPARENT -> 4 | 4 => (
T_NUMBER -> 4 | 5 => 10
T_OPERATOR -> 4 | 7 => <
T_ALNUM -> 4 | 8 => a
T_LBRACKET -> 4 | 10 => {
T_ALNUM -> 5 | 3 => test
T_LOPARENT -> 5 | 7 => (
T_ALNUM -> 5 | 8 => a
T_SEMICOLON -> 5 | 10 =>;
T_ELSEIF -> 7 | 2 => elseif
T_LOPARENT -> 7 | 8 => (
T_ALNUM -> 7 | 9 => a
T_OPERATOR -> 7 | 10 => <
T_NUMBER -> 7 | 11 => 10
T_LBRACKET -> 7 | 14 => {
T_NUMBER -> 8 | 3 => 45
T_SEMICOLON -> 8 | 5 =>;
T_ELSE -> 10 | 2 => else
T_LBRACKET -> 10 | 6 => {
T_NUMBER -> 11 | 3 => 104
T_SEMICOLON -> 11 | 6 =>;
T_SEMICOLON -> 12 | 3 =>;
T_ALNUM -> 13 | 2 => a
T_LSPARENT -> 13 | 3 => [
T_NUMBER -> 13 | 4 => 0
T_OPERATOR -> 13 | 7 => =
T_NUMBER -> 13 | 9 => 11
T_SEMICOLON -> 13 | 11 =>;
T_LSPARENT -> 14 | 2 => [
T_ALNUM -> 14 | 3 => b
T_COMMA -> 14 | 4 => ,
T_ALNUM -> 14 | 5 => c
T_OPERATOR -> 14 | 7 => =
T_ALNUM -> 14 | 8 => test
T_LOPARENT -> 14 | 12 => (
T_ALNUM -> 14 | 13 => a
T_SEMICOLON -> 14 | 15 =>;
T_FOREACH -> 16 | 2 => foreach
T_LOPARENT -> 16 | 9 => (
T_ALNUM -> 16 | 10 => a
T_COLON -> 16 | 12 => :
T_ALNUM -> 16 | 14 => value
T_LBRACKET -> 16 | 20 => {
T_ALNUM -> 17 | 3 => println
T_LOPARENT -> 17 | 10 => (
T_ALNUM -> 17 | 11 => value
T_SEMICOLON -> 17 | 17 =>;
T_SEMICOLON -> 18 | 3 =>;
T_SWITCH -> 19 | 2 => switch
T_LOPARENT -> 19 | 8 => (
T_ALNUM -> 19 | 9 => te
T_LBRACKET -> 19 | 12 => {
T_CASE -> 20 | 3 => case
T_NUMBER -> 20 | 8 => 1
T_COLON -> 20 | 9 => :
T_ALNUM -> 21 | 4 => a
T_LSPARENT -> 21 | 5 => [
T_NUMBER -> 21 | 6 => 0
T_OPERATOR -> 21 | 9 => =+
T_NUMBER -> 21 | 12 => 10
T_SEMICOLON -> 21 | 14 =>;
T_BREAK -> 22 | 3 => break
T_SEMICOLON -> 22 | 8 =>;
T_CASE -> 23 | 3 => case
T_NUMBER -> 23 | 8 => 2
T_COLON -> 23 | 9 => :
T_ALNUM -> 24 | 4 => a
T_LSPARENT -> 24 | 5 => [
T_NUMBER -> 24 | 6 => 1
T_OPERATOR -> 24 | 9 => =-
T_NUMBER -> 24 | 11 => 10
T_SEMICOLON -> 24 | 13 =>;
T_DEFAULT -> 25 | 3 => default
T_COLON -> 25 | 10 => :
T_ALNUM -> 26 | 4 => println
T_LOPARENT -> 26 | 11 => (
T_ALNUM -> 26 | 12 => a
T_SEMICOLON -> 26 | 14 =>;
T_BREAK -> 27 | 3 => break
T_SEMICOLON -> 27 | 8 =>;
T_SEMICOLON -> 28 | 3 =>;
T_COMMENT -> 29 | 2 => /* komentarz który nie jest brany pod uwagę podczas interpretacji
może on być wieloliniowy
podobnie jak w php */
T_ALNUM -> 32 | 2 => T
T_OPERATOR -> 32 | 4 => =
T_FALSE -> 32 | 6 => false
T_SEMICOLON -> 32 | 11 =>;
T_ALNUM -> 33 | 2 => T1
T_OPERATOR -> 33 | 5 => =
T_TRUE -> 33 | 7 => true
T_SEMICOLON -> 33 | 11 =>;
T_EXIT -> 35 | 2 => exit
T_LOPARENT -> 35 | 6 => (
T_NUMBER -> 35 | 7 => 100
T_SEMICOLON -> 35 | 11 =>;
T_FOR -> 37 | 2 => for
T_LOPARENT -> 37 | 5 => (
T_ALNUM -> 37 | 6 => x
T_OPERATOR -> 37 | 8 => =
T_NUMBER -> 37 | 10 => 0
T_COMMA -> 37 | 11 => ,
T_ALNUM -> 37 | 12 => y
T_OPERATOR -> 37 | 13 => =
T_NUMBER -> 37 | 14 => 0
T_SEMICOLON -> 37 | 15 =>;
T_ALNUM -> 37 | 16 => x
T_OPERATOR -> 37 | 18 => <
T_NUMBER -> 37 | 20 => 1000
T_SEMICOLON -> 37 | 24 =>;
T_ALNUM -> 37 | 25 => x
T_OPERATOR -> 37 | 26 => ++
T_COMMA -> 37 | 28 => ,
T_ALNUM -> 37 | 29 => y
T_OPERATOR -> 37 | 30 => ++
T_LBRACKET -> 37 | 33 => {
T_IF -> 38 | 3 => if
T_LOPARENT -> 38 | 5 => (
T_ALNUM -> 38 | 6 => x
T_OPERATOR -> 38 | 8 => %
T_NUMBER -> 38 | 10 => 2
T_OPERATOR -> 38 | 12 => ==
T_NUMBER -> 38 | 15 => 0
T_LBRACKET -> 38 | 17 => {
T_ALNUM -> 39 | 4 => y
T_OPERATOR -> 39 | 6 => +=
T_ALNUM -> 39 | 9 => x
T_SEMICOLON -> 39 | 10 =>;
T_CONTINUE -> 40 | 4 => continue
T_SEMICOLON -> 40 | 12 =>;
T_SEMICOLON -> 41 | 4 =>;
T_ALNUM -> 42 | 3 => println
T_LOPARENT -> 42 | 10 => (
T_ALNUM -> 42 | 11 => y
T_OPERATOR -> 42 | 13 => .
T_ALNUM -> 42 | 15 => x
T_SEMICOLON -> 42 | 17 =>;
T_SEMICOLON -> 43 | 3 =>;
T_UNSET -> 44 | 2 => unset
T_LOPARENT -> 44 | 7 => (
T_ALNUM -> 44 | 8 => a
T_SEMICOLON -> 44 | 10 =>;
T_ALNUM -> 46 | 2 => funkcja
T_COLON -> 46 | 9 => :
T_LOPARENT -> 46 | 10 => (
T_ALNUM -> 46 | 11 => var1
T_COMMA -> 46 | 15 => ,
T_ALNUM -> 46 | 16 => var2
T_OPERATOR -> 46 | 22 => =
T_LBRACKET -> 46 | 24 => {
T_ALNUM -> 47 | 3 => out
T_OPERATOR -> 47 | 7 => =
T_ALNUM -> 47 | 9 => var1
T_OPERATOR -> 47 | 14 => +
T_ALNUM -> 47 | 16 => var2
T_SEMICOLON -> 47 | 20 =>;
T_RETURN -> 48 | 3 => return
T_ALNUM -> 48 | 10 => out
T_SEMICOLON -> 48 | 13 =>;
T_ALNUM -> 51 | 2 => a
T_LSPARENT -> 51 | 3 => [
T_NUMBER -> 51 | 4 => 3
T_OPERATOR -> 51 | 7 => =
T_ALNUM -> 51 | 9 => funkcja
T_LOPARENT -> 51 | 16 => (
T_ALNUM -> 51 | 17 => a
T_LSPARENT -> 51 | 18 => [
T_NUMBER -> 51 | 19 => 0
T_COMMA -> 51 | 21 => ,
T_ALNUM -> 51 | 22 => a
T_LSPARENT -> 51 | 23 => [
T_NUMBER -> 51 | 24 => 1
T_SEMICOLON -> 51 | 27 =>;
No i właśnie w tym momencie się zatrzymałem. Proszę Was zatem o pomoc i jakieś naprowadzenie, bo chcę doprowadzić to do końca. Zmienne będą deklarowane na bieżąco podczas pierwszego użycia (o ile będzie ono poprawne) i zapisywane razem z ich typami.
Myślałem, żeby teraz, mając tablicę takich tokenów, stworzyć drzewo AST, ale włączyła mi się jakaś blokada i nie mogę ruszyć z miejsca. Operatory będą osobną klasą uwzględniającą priorytety, łączność i liczbę parametrów. Zamierzałem podzielić je na 2 typy: unarne oraz binarne.
Rozbijanie na tokeny można sprawdzić w tym miejscu:
Tokenizer
Docelowo mam zamiar napisać interpretery tego języka jeszcze w C++ oraz w Javie, ale jak na razie to odległa przyszłość.
Jeśli coś jest niejasne, to piszcie, postaram się w miarę możliwości wytłumaczyć mój zamysł.
EDIT:
Przepraszam za błąd ortograficzny, jest mi naprawdę przykro z tego powodu. Staram się pisać poprawnie i jest mi niezmiernie głupio, że popełniłem taki lapsus...